Model: ali-elganzory/open-sci-ref-v0.02-1.7b-nemotron-hq-300B-16k-SFT-Tulu3-decontaminated Source: Original Platform
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 14634,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"entropy": 1.47578125, "epoch": 0.0013666803334700013, "grad_norm": 0.33593242457115113, "learning_rate": 1.0227272727272728e-07, "loss": 1.5739, "mean_token_accuracy": 0.6472275972366333, "num_tokens": 938571.0, "step": 10},
    {"entropy": 1.4359375, "epoch": 0.0027333606669400026, "grad_norm": 0.34553589324619793, "learning_rate": 2.1590909090909094e-07, "loss": 1.5121, "mean_token_accuracy": 0.6633087396621704, "num_tokens": 1829597.0, "step": 20},
    {"entropy": 1.45703125, "epoch": 0.004100041000410004, "grad_norm": 0.4047460490867157, "learning_rate": 3.2954545454545455e-07, "loss": 1.5502, "mean_token_accuracy": 0.6545426428318024, "num_tokens": 2756097.0, "step": 30},
    {"entropy": 1.5046875, "epoch": 0.005466721333880005, "grad_norm": 0.33029629677687194, "learning_rate": 4.431818181818182e-07, "loss": 1.5706, "mean_token_accuracy": 0.6511855959892273, "num_tokens": 3667157.0, "step": 40},
    {"entropy": 1.440625, "epoch": 0.006833401667350007, "grad_norm": 0.3422623157489635, "learning_rate": 5.568181818181818e-07, "loss": 1.5163, "mean_token_accuracy": 0.6594492614269256, "num_tokens": 4593212.0, "step": 50},
    {"entropy": 1.4875, "epoch": 0.008200082000820008, "grad_norm": 0.42445052030444796, "learning_rate": 6.704545454545456e-07, "loss": 1.5504, "mean_token_accuracy": 0.6537157356739044, "num_tokens": 5559539.0, "step": 60},
    {"entropy": 1.5015625, "epoch": 0.00956676233429001, "grad_norm": 0.34257984944166675, "learning_rate": 7.840909090909092e-07, "loss": 1.5794, "mean_token_accuracy": 0.6488156080245971, "num_tokens": 6518318.0, "step": 70},
    {"entropy": 1.5453125, "epoch": 0.01093344266776001, "grad_norm": 0.37454168685988903, "learning_rate": 8.977272727272728e-07, "loss": 1.6236, "mean_token_accuracy": 0.6418857753276825, "num_tokens": 7462414.0, "step": 80},
    {"entropy": 1.4703125, "epoch": 0.012300123001230012, "grad_norm": 0.3236560405224948, "learning_rate": 1.0113636363636365e-06, "loss": 1.5436, "mean_token_accuracy": 0.6538457691669464, "num_tokens": 8328147.0, "step": 90},
    {"entropy": 1.4625, "epoch": 0.013666803334700014, "grad_norm": 0.3415867567027531, "learning_rate": 1.125e-06, "loss": 1.5479, "mean_token_accuracy": 0.6575356006622315, "num_tokens": 9256532.0, "step": 100},
    {"entropy": 1.45546875, "epoch": 0.015033483668170014, "grad_norm": 0.325791412830201, "learning_rate": 1.2386363636363638e-06, "loss": 1.5202, "mean_token_accuracy": 0.6603422462940216, "num_tokens": 10191350.0, "step": 110},
    {"entropy": 1.45625, "epoch": 0.016400164001640016, "grad_norm": 0.31612748192848017, "learning_rate": 1.3522727272727273e-06, "loss": 1.5316, "mean_token_accuracy": 0.6589971661567688, "num_tokens": 11096376.0, "step": 120},
    {"entropy": 1.45390625, "epoch": 0.017766844335110017, "grad_norm": 0.3177743645645549, "learning_rate": 1.465909090909091e-06, "loss": 1.5169, "mean_token_accuracy": 0.6602472007274628, "num_tokens": 12025075.0, "step": 130},
    {"entropy": 1.46484375, "epoch": 0.01913352466858002, "grad_norm": 0.3155655209996274, "learning_rate": 1.5795454545454547e-06, "loss": 1.5252, "mean_token_accuracy": 0.6592924475669861, "num_tokens": 12961701.0, "step": 140},
    {"entropy": 1.47734375, "epoch": 0.02050020500205002, "grad_norm": 0.3139736395296892, "learning_rate": 1.6931818181818182e-06, "loss": 1.5433, "mean_token_accuracy": 0.6573047280311585, "num_tokens": 13905979.0, "step": 150},
    {"entropy": 1.45546875, "epoch": 0.02186688533552002, "grad_norm": 0.2710070697267881, "learning_rate": 1.8068181818181822e-06, "loss": 1.5192, "mean_token_accuracy": 0.6595899879932403, "num_tokens": 14850397.0, "step": 160},
    {"entropy": 1.47734375, "epoch": 0.023233565668990024, "grad_norm": 0.24050232785935366, "learning_rate": 1.9204545454545457e-06, "loss": 1.5366, "mean_token_accuracy": 0.658087146282196, "num_tokens": 15703108.0, "step": 170},
    {"entropy": 1.4296875, "epoch": 0.024600246002460024, "grad_norm": 0.1908944905029829, "learning_rate": 2.034090909090909e-06, "loss": 1.4693, "mean_token_accuracy": 0.6639279246330261, "num_tokens": 16615464.0, "step": 180},
    {"entropy": 1.48046875, "epoch": 0.025966926335930025, "grad_norm": 0.2330191455995494, "learning_rate": 2.147727272727273e-06, "loss": 1.5131, "mean_token_accuracy": 0.6539380729198456, "num_tokens": 17581557.0, "step": 190},
    {"entropy": 1.47578125, "epoch": 0.02733360666940003, "grad_norm": 0.18985747845348652, "learning_rate": 2.2613636363636366e-06, "loss": 1.524, "mean_token_accuracy": 0.6568748593330384, "num_tokens": 18451335.0, "step": 200},
    {"entropy": 1.45703125, "epoch": 0.02870028700287003, "grad_norm": 0.18851256475372202, "learning_rate": 2.375e-06, "loss": 1.4994, "mean_token_accuracy": 0.6567203879356385, "num_tokens": 19392109.0, "step": 210},
    {"entropy": 1.47421875, "epoch": 0.03006696733634003, "grad_norm": 0.1869946530622133, "learning_rate": 2.488636363636364e-06, "loss": 1.5002, "mean_token_accuracy": 0.6601383328437805, "num_tokens": 20323700.0, "step": 220},
    {"entropy": 1.40390625, "epoch": 0.03143364766981003, "grad_norm": 0.18011358783140455, "learning_rate": 2.6022727272727276e-06, "loss": 1.4437, "mean_token_accuracy": 0.6687813222408294, "num_tokens": 21263745.0, "step": 230},
    {"entropy": 1.465625, "epoch": 0.03280032800328003, "grad_norm": 0.17619192304877396, "learning_rate": 2.715909090909091e-06, "loss": 1.4938, "mean_token_accuracy": 0.660740327835083, "num_tokens": 22195458.0, "step": 240},
    {"entropy": 1.46171875, "epoch": 0.034167008336750036, "grad_norm": 0.1616653319492992, "learning_rate": 2.829545454545455e-06, "loss": 1.4842, "mean_token_accuracy": 0.6650952160358429, "num_tokens": 23115090.0, "step": 250},
    {"entropy": 1.4421875, "epoch": 0.03553368867022003, "grad_norm": 0.18903704247279193, "learning_rate": 2.9431818181818185e-06, "loss": 1.466, "mean_token_accuracy": 0.666004866361618, "num_tokens": 24043510.0, "step": 260},
    {"entropy": 1.44609375, "epoch": 0.03690036900369004, "grad_norm": 0.14308849649518682, "learning_rate": 3.056818181818182e-06, "loss": 1.486, "mean_token_accuracy": 0.663190883398056, "num_tokens": 24975455.0, "step": 270},
    {"entropy": 1.403125, "epoch": 0.03826704933716004, "grad_norm": 0.14319424993511848, "learning_rate": 3.1704545454545456e-06, "loss": 1.4138, "mean_token_accuracy": 0.6741089940071106, "num_tokens": 25915880.0, "step": 280},
    {"entropy": 1.428125, "epoch": 0.03963372967063004, "grad_norm": 0.15804156521630552, "learning_rate": 3.2840909090909095e-06, "loss": 1.442, "mean_token_accuracy": 0.6689739406108857, "num_tokens": 26836649.0, "step": 290},
    {"entropy": 1.390625, "epoch": 0.04100041000410004, "grad_norm": 0.124906699468033, "learning_rate": 3.397727272727273e-06, "loss": 1.4263, "mean_token_accuracy": 0.6699987471103668, "num_tokens": 27758649.0, "step": 300},
    {"entropy": 1.44921875, "epoch": 0.042367090337570044, "grad_norm": 0.1752543382503028, "learning_rate": 3.5113636363636365e-06, "loss": 1.4672, "mean_token_accuracy": 0.6654171407222748, "num_tokens": 28704182.0, "step": 310},
    {"entropy": 1.41484375, "epoch": 0.04373377067104004, "grad_norm": 0.1235773282266241, "learning_rate": 3.625e-06, "loss": 1.4291, "mean_token_accuracy": 0.6701051831245423, "num_tokens": 29605445.0, "step": 320},
    {"entropy": 1.503125, "epoch": 0.045100451004510045, "grad_norm": 0.13641416482484683, "learning_rate": 3.7386363636363635e-06, "loss": 1.512, "mean_token_accuracy": 0.6590228378772736, "num_tokens": 30515940.0, "step": 330},
    {"entropy": 1.38203125, "epoch": 0.04646713133798005, "grad_norm": 0.14899328929906422, "learning_rate": 3.852272727272728e-06, "loss": 1.3889, "mean_token_accuracy": 0.6781814873218537, "num_tokens": 31408267.0, "step": 340},
    {"entropy": 1.3625, "epoch": 0.047833811671450045, "grad_norm": 0.1345647318405238, "learning_rate": 3.965909090909091e-06, "loss": 1.3637, "mean_token_accuracy": 0.6819893181324005, "num_tokens": 32349688.0, "step": 350},
    {"entropy": 1.43359375, "epoch": 0.04920049200492005, "grad_norm": 0.12934645839708656, "learning_rate": 4.079545454545455e-06, "loss": 1.452, "mean_token_accuracy": 0.6644376397132874, "num_tokens": 33292692.0, "step": 360},
    {"entropy": 1.45, "epoch": 0.05056717233839005, "grad_norm": 0.17012336457086744, "learning_rate": 4.193181818181819e-06, "loss": 1.4607, "mean_token_accuracy": 0.6657275080680847, "num_tokens": 34223187.0, "step": 370},
    {"entropy": 1.4625, "epoch": 0.05193385267186005, "grad_norm": 0.1344284229785903, "learning_rate": 4.306818181818182e-06, "loss": 1.474, "mean_token_accuracy": 0.6631007730960846, "num_tokens": 35129123.0, "step": 380},
    {"entropy": 1.3890625, "epoch": 0.05330053300533005, "grad_norm": 0.15153938796417887, "learning_rate": 4.420454545454546e-06, "loss": 1.4061, "mean_token_accuracy": 0.6744240164756775, "num_tokens": 36084818.0, "step": 390},
    {"entropy": 1.4015625, "epoch": 0.05466721333880006, "grad_norm": 0.11624270222426862, "learning_rate": 4.53409090909091e-06, "loss": 1.3975, "mean_token_accuracy": 0.6772325575351715, "num_tokens": 36969815.0, "step": 400},
    {"entropy": 1.44375, "epoch": 0.05603389367227005, "grad_norm": 0.14355551743068182, "learning_rate": 4.647727272727273e-06, "loss": 1.4538, "mean_token_accuracy": 0.6651172697544098, "num_tokens": 37874200.0, "step": 410},
    {"entropy": 1.409375, "epoch": 0.05740057400574006, "grad_norm": 0.12474374913697618, "learning_rate": 4.761363636363637e-06, "loss": 1.4293, "mean_token_accuracy": 0.6726834654808045, "num_tokens": 38814930.0, "step": 420},
    {"entropy": 1.39140625, "epoch": 0.05876725433921006, "grad_norm": 0.1411200656633281, "learning_rate": 4.875e-06, "loss": 1.4052, "mean_token_accuracy": 0.6740757107734681, "num_tokens": 39757293.0, "step": 430},
    {"entropy": 1.428125, "epoch": 0.06013393467268006, "grad_norm": 0.1202916268788176, "learning_rate": 4.988636363636364e-06, "loss": 1.4446, "mean_token_accuracy": 0.6672864079475402, "num_tokens": 40713143.0, "step": 440},
    {"entropy": 1.43046875, "epoch": 0.06150061500615006, "grad_norm": 0.12724146812358164, "learning_rate": 4.996829646329435e-06, "loss": 1.4447, "mean_token_accuracy": 0.667222660779953, "num_tokens": 41638884.0, "step": 450},
    {"entropy": 1.4640625, "epoch": 0.06286729533962006, "grad_norm": 0.12499733423435466, "learning_rate": 4.993307031139919e-06, "loss": 1.4762, "mean_token_accuracy": 0.6644554138183594, "num_tokens": 42583453.0, "step": 460},
    {"entropy": 1.4390625, "epoch": 0.06423397567309007, "grad_norm": 0.11801225442568575, "learning_rate": 4.989784415950402e-06, "loss": 1.4441, "mean_token_accuracy": 0.6677239418029786, "num_tokens": 43522757.0, "step": 470},
    {"entropy": 1.5, "epoch": 0.06560065600656007, "grad_norm": 0.12119940721330286, "learning_rate": 4.986261800760885e-06, "loss": 1.5158, "mean_token_accuracy": 0.6571428835391998, "num_tokens": 44444147.0, "step": 480},
    {"entropy": 1.4125, "epoch": 0.06696733634003006, "grad_norm": 0.1344425715421983, "learning_rate": 4.9827391855713685e-06, "loss": 1.4207, "mean_token_accuracy": 0.675386369228363, "num_tokens": 45400305.0, "step": 490},
    {"entropy": 1.41015625, "epoch": 0.06833401667350007, "grad_norm": 0.1500027528004066, "learning_rate": 4.979216570381852e-06, "loss": 1.4057, "mean_token_accuracy": 0.6719016253948211, "num_tokens": 46325614.0, "step": 500},
    {"entropy": 1.396875, "epoch": 0.06970069700697007, "grad_norm": 0.11808172820096659, "learning_rate": 4.975693955192335e-06, "loss": 1.4115, "mean_token_accuracy": 0.6707705914974212, "num_tokens": 47238170.0, "step": 510},
    {"entropy": 1.43828125, "epoch": 0.07106737734044007, "grad_norm": 0.12254010166447069, "learning_rate": 4.972171340002819e-06, "loss": 1.4409, "mean_token_accuracy": 0.66942298412323, "num_tokens": 48194911.0, "step": 520},
    {"entropy": 1.4328125, "epoch": 0.07243405767391008, "grad_norm": 0.13124483719382588, "learning_rate": 4.968648724813302e-06, "loss": 1.4411, "mean_token_accuracy": 0.6693200290203094, "num_tokens": 49112005.0, "step": 530},
    {"entropy": 1.3984375, "epoch": 0.07380073800738007, "grad_norm": 0.12845290720748656, "learning_rate": 4.965126109623785e-06, "loss": 1.4055, "mean_token_accuracy": 0.6728921890258789, "num_tokens": 50015461.0, "step": 540},
    {"entropy": 1.4140625, "epoch": 0.07516741834085007, "grad_norm": 0.11315252176838612, "learning_rate": 4.961603494434268e-06, "loss": 1.4284, "mean_token_accuracy": 0.6706915199756622, "num_tokens": 50935643.0, "step": 550},
    {"entropy": 1.38203125, "epoch": 0.07653409867432008, "grad_norm": 0.1341179278369481, "learning_rate": 4.958080879244752e-06, "loss": 1.3874, "mean_token_accuracy": 0.6761618852615356, "num_tokens": 51878002.0, "step": 560},
    {"entropy": 1.38125, "epoch": 0.07790077900779008, "grad_norm": 0.12268934214914148, "learning_rate": 4.954558264055234e-06, "loss": 1.394, "mean_token_accuracy": 0.6747714340686798, "num_tokens": 52758343.0, "step": 570},
    {"entropy": 1.365625, "epoch": 0.07926745934126007, "grad_norm": 0.14305426716463207, "learning_rate": 4.951035648865719e-06, "loss": 1.3645, "mean_token_accuracy": 0.683729475736618, "num_tokens": 53650442.0, "step": 580},
    {"entropy": 1.38671875, "epoch": 0.08063413967473008, "grad_norm": 0.13014661812691572, "learning_rate": 4.9475130336762015e-06, "loss": 1.3907, "mean_token_accuracy": 0.675061148405075, "num_tokens": 54555977.0, "step": 590},
    {"entropy": 1.425, "epoch": 0.08200082000820008, "grad_norm": 0.11878249791005921, "learning_rate": 4.943990418486685e-06, "loss": 1.4276, "mean_token_accuracy": 0.6679265916347503, "num_tokens": 55522236.0, "step": 600},
    {"entropy": 1.4171875, "epoch": 0.08336750034167008, "grad_norm": 0.1320477678271073, "learning_rate": 4.9404678032971685e-06, "loss": 1.4219, "mean_token_accuracy": 0.67267826795578, "num_tokens": 56427197.0, "step": 610},
    {"entropy": 1.3734375, "epoch": 0.08473418067514009, "grad_norm": 0.11922262444853492, "learning_rate": 4.936945188107651e-06, "loss": 1.3904, "mean_token_accuracy": 0.6790150582790375, "num_tokens": 57362921.0, "step": 620},
    {"entropy": 1.34296875, "epoch": 0.08610086100861009, "grad_norm": 0.13361704065028643, "learning_rate": 4.933422572918135e-06, "loss": 1.3437, "mean_token_accuracy": 0.6859430849552155, "num_tokens": 58256680.0, "step": 630},
    {"entropy": 1.375, "epoch": 0.08746754134208008, "grad_norm": 0.14334454832319102, "learning_rate": 4.929899957728618e-06, "loss": 1.3772, "mean_token_accuracy": 0.6794634640216828, "num_tokens": 59193508.0, "step": 640},
    {"entropy": 1.384375, "epoch": 0.08883422167555009, "grad_norm": 0.1451016446607481, "learning_rate": 4.926377342539102e-06, "loss": 1.382, "mean_token_accuracy": 0.677071726322174, "num_tokens": 60108420.0, "step": 650},
    {"entropy": 1.3625, "epoch": 0.09020090200902009, "grad_norm": 0.1305313170079439, "learning_rate": 4.922854727349585e-06, "loss": 1.3626, "mean_token_accuracy": 0.6805915176868439, "num_tokens": 61034298.0, "step": 660},
    {"entropy": 1.4, "epoch": 0.09156758234249009, "grad_norm": 0.15065559389600378, "learning_rate": 4.919332112160068e-06, "loss": 1.4037, "mean_token_accuracy": 0.6770659208297729, "num_tokens": 61983537.0, "step": 670},
    {"entropy": 1.378125, "epoch": 0.0929342626759601, "grad_norm": 0.1344239842706082, "learning_rate": 4.915809496970551e-06, "loss": 1.3903, "mean_token_accuracy": 0.6754595756530761, "num_tokens": 62935993.0, "step": 680},
    {"entropy": 1.46015625, "epoch": 0.0943009430094301, "grad_norm": 0.12103976931407226, "learning_rate": 4.912286881781035e-06, "loss": 1.4785, "mean_token_accuracy": 0.6603707492351532, "num_tokens": 63853924.0, "step": 690},
    {"entropy": 1.37421875, "epoch": 0.09566762334290009, "grad_norm": 0.1777982351842348, "learning_rate": 4.908764266591518e-06, "loss": 1.3789, "mean_token_accuracy": 0.6775857329368591, "num_tokens": 64763931.0, "step": 700},
    {"entropy": 1.38984375, "epoch": 0.0970343036763701, "grad_norm": 0.13917687782356375, "learning_rate": 4.9052416514020015e-06, "loss": 1.3894, "mean_token_accuracy": 0.6767784059047699, "num_tokens": 65673140.0, "step": 710},
    {"entropy": 1.34609375, "epoch": 0.0984009840098401, "grad_norm": 0.12897379556228614, "learning_rate": 4.901719036212484e-06, "loss": 1.3579, "mean_token_accuracy": 0.6811029970645904, "num_tokens": 66640391.0, "step": 720},
    {"entropy": 1.35, "epoch": 0.0997676643433101, "grad_norm": 0.12382476405803816, "learning_rate": 4.898196421022968e-06, "loss": 1.3589, "mean_token_accuracy": 0.6841527640819549, "num_tokens": 67563282.0, "step": 730},
    {"entropy": 1.35078125, "epoch": 0.1011343446767801, "grad_norm": 0.14087040614479748, "learning_rate": 4.894673805833451e-06, "loss": 1.3327, "mean_token_accuracy": 0.6857786476612091, "num_tokens": 68416165.0, "step": 740},
    {"entropy": 1.3828125, "epoch": 0.1025010250102501, "grad_norm": 0.11364526324484656, "learning_rate": 4.891151190643935e-06, "loss": 1.3917, "mean_token_accuracy": 0.6771961390972138, "num_tokens": 69336094.0, "step": 750},
    {"entropy": 1.3609375, "epoch": 0.1038677053437201, "grad_norm": 0.11663110318817138, "learning_rate": 4.8876285754544175e-06, "loss": 1.3653, "mean_token_accuracy": 0.6796006381511688, "num_tokens": 70292744.0, "step": 760},
    {"entropy": 1.41875, "epoch": 0.10523438567719011, "grad_norm": 0.12374954253074405, "learning_rate": 4.884105960264901e-06, "loss": 1.4352, "mean_token_accuracy": 0.6695616126060486, "num_tokens": 71192370.0, "step": 770},
    {"entropy": 1.35703125, "epoch": 0.1066010660106601, "grad_norm": 0.13767371767386172, "learning_rate": 4.880583345075385e-06, "loss": 1.3515, "mean_token_accuracy": 0.6830070853233338, "num_tokens": 72109962.0, "step": 780},
    {"entropy": 1.33046875, "epoch": 0.1079677463441301, "grad_norm": 0.12516004272108025, "learning_rate": 4.877060729885867e-06, "loss": 1.3286, "mean_token_accuracy": 0.6861463725566864, "num_tokens": 73022537.0, "step": 790},
    {"entropy": 1.3546875, "epoch": 0.10933442667760011, "grad_norm": 0.14909207829927137, "learning_rate": 4.873538114696351e-06, "loss": 1.3633, "mean_token_accuracy": 0.6801453471183777, "num_tokens": 73932084.0, "step": 800},
    {"entropy": 1.359375, "epoch": 0.11070110701107011, "grad_norm": 0.11880148658663449, "learning_rate": 4.870015499506834e-06, "loss": 1.3706, "mean_token_accuracy": 0.6823614716529847, "num_tokens": 74843608.0, "step": 810},
    {"entropy": 1.3828125, "epoch": 0.1120677873445401, "grad_norm": 0.1196684274234683, "learning_rate": 4.866492884317318e-06, "loss": 1.3839, "mean_token_accuracy": 0.676324874162674, "num_tokens": 75769415.0, "step": 820},
    {"entropy": 1.35390625, "epoch": 0.11343446767801012, "grad_norm": 0.12304584016836433, "learning_rate": 4.862970269127801e-06, "loss": 1.3656, "mean_token_accuracy": 0.6842137157917023, "num_tokens": 76676625.0, "step": 830},
    {"entropy": 1.3859375, "epoch": 0.11480114801148011, "grad_norm": 0.12821137591802156, "learning_rate": 4.859447653938284e-06, "loss": 1.3736, "mean_token_accuracy": 0.6776384353637696, "num_tokens": 77575710.0, "step": 840},
    {"entropy": 1.3984375, "epoch": 0.11616782834495011, "grad_norm": 0.1669162719855578, "learning_rate": 4.855925038748768e-06, "loss": 1.4155, "mean_token_accuracy": 0.6739114463329315, "num_tokens": 78573992.0, "step": 850},
    {"entropy": 1.3796875, "epoch": 0.11753450867842012, "grad_norm": 0.14389796909220767, "learning_rate": 4.852402423559251e-06, "loss": 1.3911, "mean_token_accuracy": 0.6772493362426758, "num_tokens": 79532329.0, "step": 860},
    {"entropy": 1.3984375, "epoch": 0.11890118901189012, "grad_norm": 0.12331565026496326, "learning_rate": 4.848879808369734e-06, "loss": 1.4177, "mean_token_accuracy": 0.6740984797477723, "num_tokens": 80438221.0, "step": 870},
    {"entropy": 1.34453125, "epoch": 0.12026786934536011, "grad_norm": 0.11615929853443284, "learning_rate": 4.8453571931802175e-06, "loss": 1.3323, "mean_token_accuracy": 0.6876966774463653, "num_tokens": 81376762.0, "step": 880},
    {"entropy": 1.31328125, "epoch": 0.12163454967883013, "grad_norm": 0.11650919358124776, "learning_rate": 4.8418345779907e-06, "loss": 1.3123, "mean_token_accuracy": 0.6910358250141144, "num_tokens": 82272468.0, "step": 890},
    {"entropy": 1.40546875, "epoch": 0.12300123001230012, "grad_norm": 0.14987145425842305, "learning_rate": 4.838311962801184e-06, "loss": 1.417, "mean_token_accuracy": 0.6733951926231384, "num_tokens": 83215846.0, "step": 900},
    {"entropy": 1.36171875, "epoch": 0.12436791034577012, "grad_norm": 0.12553148658834193, "learning_rate": 4.834789347611667e-06, "loss": 1.3722, "mean_token_accuracy": 0.6807812750339508, "num_tokens": 84093524.0, "step": 910},
    {"entropy": 1.39375, "epoch": 0.12573459067924012, "grad_norm": 0.121953734062885, "learning_rate": 4.831266732422151e-06, "loss": 1.3866, "mean_token_accuracy": 0.6781980335712433, "num_tokens": 84978921.0, "step": 920},
    {"entropy": 1.2828125, "epoch": 0.12710127101271013, "grad_norm": 0.12346276663782899, "learning_rate": 4.827744117232634e-06, "loss": 1.2785, "mean_token_accuracy": 0.6969594538211823, "num_tokens": 85868292.0, "step": 930},
    {"entropy": 1.31953125, "epoch": 0.12846795134618014, "grad_norm": 0.14083948756009163, "learning_rate": 4.824221502043117e-06, "loss": 1.3124, "mean_token_accuracy": 0.6884961247444152, "num_tokens": 86759570.0, "step": 940},
    {"entropy": 1.3734375, "epoch": 0.12983463167965012, "grad_norm": 0.15302007791467442, "learning_rate": 4.820698886853601e-06, "loss": 1.3725, "mean_token_accuracy": 0.6753612399101258, "num_tokens": 87742990.0, "step": 950},
    {"entropy": 1.37265625, "epoch": 0.13120131201312013, "grad_norm": 0.13407326232441735, "learning_rate": 4.817176271664084e-06, "loss": 1.3744, "mean_token_accuracy": 0.6773225545883179, "num_tokens": 88685843.0, "step": 960},
    {"entropy": 1.31640625, "epoch": 0.13256799234659014, "grad_norm": 0.13150009861374143, "learning_rate": 4.813653656474567e-06, "loss": 1.3151, "mean_token_accuracy": 0.6871741890907288, "num_tokens": 89560923.0, "step": 970},
    {"entropy": 1.3640625, "epoch": 0.13393467268006012, "grad_norm": 0.13155969004390802, "learning_rate": 4.8101310412850505e-06, "loss": 1.3742, "mean_token_accuracy": 0.6773284494876861, "num_tokens": 90477434.0, "step": 980},
    {"entropy": 1.346875, "epoch": 0.13530135301353013, "grad_norm": 0.14184151716420373, "learning_rate": 4.806608426095534e-06, "loss": 1.3586, "mean_token_accuracy": 0.6832744061946869, "num_tokens": 91445593.0, "step": 990},
    {"entropy": 1.34453125, "epoch": 0.13666803334700015, "grad_norm": 0.13292633817156044, "learning_rate": 4.803085810906017e-06, "loss": 1.3485, "mean_token_accuracy": 0.6847118556499481, "num_tokens": 92372246.0, "step": 1000},
    {"entropy": 1.3953125, "epoch": 0.13803471368047013, "grad_norm": 0.1482501964671727, "learning_rate": 4.7995631957165e-06, "loss": 1.3989, "mean_token_accuracy": 0.6725464284420013, "num_tokens": 93322975.0, "step": 1010},
    {"entropy": 1.31171875, "epoch": 0.13940139401394014, "grad_norm": 0.12839624351826165, "learning_rate": 4.796040580526984e-06, "loss": 1.3159, "mean_token_accuracy": 0.689539498090744, "num_tokens": 94243005.0, "step": 1020},
    {"entropy": 1.34296875, "epoch": 0.14076807434741015, "grad_norm": 0.13808158041579283, "learning_rate": 4.792517965337467e-06, "loss": 1.3452, "mean_token_accuracy": 0.6848942458629608, "num_tokens": 95128367.0, "step": 1030},
    {"entropy": 1.32734375, "epoch": 0.14213475468088013, "grad_norm": 0.11791791459013876, "learning_rate": 4.78899535014795e-06, "loss": 1.3393, "mean_token_accuracy": 0.6859888076782227, "num_tokens": 96052977.0, "step": 1040},
    {"entropy": 1.33203125, "epoch": 0.14350143501435014, "grad_norm": 0.11647103093576822, "learning_rate": 4.785472734958434e-06, "loss": 1.3321, "mean_token_accuracy": 0.6858505129814148, "num_tokens": 96997179.0, "step": 1050},
    {"entropy": 1.36953125, "epoch": 0.14486811534782015, "grad_norm": 0.17051435435259377, "learning_rate": 4.781950119768916e-06, "loss": 1.3801, "mean_token_accuracy": 0.678006249666214, "num_tokens": 97934565.0, "step": 1060},
    {"entropy": 1.3046875, "epoch": 0.14623479568129014, "grad_norm": 0.137943738792979, "learning_rate": 4.778427504579401e-06, "loss": 1.2851, "mean_token_accuracy": 0.6944494187831879, "num_tokens": 98864499.0, "step": 1070},
    {"entropy": 1.309375, "epoch": 0.14760147601476015, "grad_norm": 0.11725105476013466, "learning_rate": 4.774904889389883e-06, "loss": 1.3054, "mean_token_accuracy": 0.6890417337417603, "num_tokens": 99822130.0, "step": 1080},
    {"entropy": 1.325, "epoch": 0.14896815634823016, "grad_norm": 0.12861475595264635, "learning_rate": 4.771382274200367e-06, "loss": 1.3323, "mean_token_accuracy": 0.6880747437477112, "num_tokens": 100744706.0, "step": 1090},
    {"entropy": 1.36796875, "epoch": 0.15033483668170014, "grad_norm": 0.18621115628087576, "learning_rate": 4.76785965901085e-06, "loss": 1.3672, "mean_token_accuracy": 0.6786254942417145, "num_tokens": 101705020.0, "step": 1100},
    {"entropy": 1.39765625, "epoch": 0.15170151701517015, "grad_norm": 0.12973492794407704, "learning_rate": 4.764337043821333e-06, "loss": 1.4088, "mean_token_accuracy": 0.6722543716430665, "num_tokens": 102616456.0, "step": 1110},
    {"entropy": 1.35859375, "epoch": 0.15306819734864016, "grad_norm": 0.13486096568052702, "learning_rate": 4.760814428631817e-06, "loss": 1.3565, "mean_token_accuracy": 0.6823795199394226, "num_tokens": 103534288.0, "step": 1120},
    {"entropy": 1.32578125, "epoch": 0.15443487768211014, "grad_norm": 0.1432821422668355, "learning_rate": 4.7572918134423e-06, "loss": 1.3204, "mean_token_accuracy": 0.6877013444900513, "num_tokens": 104412112.0, "step": 1130},
    {"entropy": 1.3015625, "epoch": 0.15580155801558015, "grad_norm": 0.10574942782081341, "learning_rate": 4.753769198252783e-06, "loss": 1.3099, "mean_token_accuracy": 0.690114563703537, "num_tokens": 105339912.0, "step": 1140},
    {"entropy": 1.3921875, "epoch": 0.15716823834905017, "grad_norm": 0.18056603090234413, "learning_rate": 4.7502465830632665e-06, "loss": 1.3955, "mean_token_accuracy": 0.6749017894268036, "num_tokens": 106300822.0, "step": 1150},
    {"entropy": 1.29296875, "epoch": 0.15853491868252015, "grad_norm": 0.11796654131940439, "learning_rate": 4.74672396787375e-06, "loss": 1.2874, "mean_token_accuracy": 0.6964833796024322, "num_tokens": 107234943.0, "step": 1160},
    {"entropy": 1.32109375, "epoch": 0.15990159901599016, "grad_norm": 0.16131109040770894, "learning_rate": 4.743201352684233e-06, "loss": 1.3215, "mean_token_accuracy": 0.6890954613685608, "num_tokens": 108169288.0, "step": 1170},
    {"entropy": 1.321875, "epoch": 0.16126827934946017, "grad_norm": 0.13893865395668883, "learning_rate": 4.739678737494716e-06, "loss": 1.3263, "mean_token_accuracy": 0.6901063919067383, "num_tokens": 109066252.0, "step": 1180},
    {"entropy": 1.340625, "epoch": 0.16263495968293015, "grad_norm": 0.12921678168700804, "learning_rate": 4.7361561223052e-06, "loss": 1.3393, "mean_token_accuracy": 0.683655458688736, "num_tokens": 109981241.0, "step": 1190},
    {"entropy": 1.32109375, "epoch": 0.16400164001640016, "grad_norm": 0.16273875849750358, "learning_rate": 4.7326335071156834e-06, "loss": 1.3269, "mean_token_accuracy": 0.6895385205745697, "num_tokens": 110887025.0, "step": 1200},
    {"entropy": 1.33125, "epoch": 0.16536832034987017, "grad_norm": 0.11415462166905854, "learning_rate": 4.729110891926166e-06, "loss": 1.3436, "mean_token_accuracy": 0.6842290580272674, "num_tokens": 111835136.0, "step": 1210},
    {"entropy": 1.31953125, "epoch": 0.16673500068334016, "grad_norm": 0.15714405710599533, "learning_rate": 4.72558827673665e-06, "loss": 1.3299, "mean_token_accuracy": 0.6875687420368195, "num_tokens": 112715049.0, "step": 1220},
    {"entropy": 1.33046875, "epoch": 0.16810168101681017, "grad_norm": 0.12756586305591994, "learning_rate": 4.722065661547132e-06, "loss": 1.3462, "mean_token_accuracy": 0.6866683483123779, "num_tokens": 113645212.0, "step": 1230},
    {"entropy": 1.35, "epoch": 0.16946836135028018, "grad_norm": 0.14428373611987774, "learning_rate": 4.718543046357617e-06, "loss": 1.3512, "mean_token_accuracy": 0.6830075085163116, "num_tokens": 114607757.0, "step": 1240},
    {"entropy": 1.27421875, "epoch": 0.17083504168375016, "grad_norm": 0.12324571354054409, "learning_rate": 4.7150204311680995e-06, "loss": 1.2856, "mean_token_accuracy": 0.6924615323543548, "num_tokens": 115481536.0, "step": 1250},
    {"entropy": 1.3390625, "epoch": 0.17220172201722017, "grad_norm": 0.12330003123550451, "learning_rate": 4.711497815978583e-06, "loss": 1.3414, "mean_token_accuracy": 0.6857994675636292, "num_tokens": 116398355.0, "step": 1260},
    {"entropy": 1.3390625, "epoch": 0.17356840235069018, "grad_norm": 0.10918171452015389, "learning_rate": 4.707975200789066e-06, "loss": 1.3571, "mean_token_accuracy": 0.6810788214206696, "num_tokens": 117363895.0, "step": 1270},
    {"entropy": 1.34140625, "epoch": 0.17493508268416016, "grad_norm": 0.12901991491666207, "learning_rate": 4.704452585599549e-06, "loss": 1.3464, "mean_token_accuracy": 0.6822156429290771, "num_tokens": 118281157.0, "step": 1280},
    {"entropy": 1.31640625, "epoch": 0.17630176301763018, "grad_norm": 0.14982760178070825, "learning_rate": 4.700929970410033e-06, "loss": 1.3179, "mean_token_accuracy": 0.6894472658634185, "num_tokens": 119199903.0, "step": 1290},
    {"entropy": 1.32578125, "epoch": 0.17766844335110019, "grad_norm": 0.11823900894754984, "learning_rate": 4.697407355220516e-06, "loss": 1.3234, "mean_token_accuracy": 0.6892919421195984, "num_tokens": 120153384.0, "step": 1300},
    {"entropy": 1.31171875, "epoch": 0.17903512368457017, "grad_norm": 0.13029282997419595, "learning_rate": 4.693884740030999e-06, "loss": 1.3047, "mean_token_accuracy": 0.6887700617313385, "num_tokens": 121028690.0, "step": 1310},
    {"entropy": 1.3578125, "epoch": 0.18040180401804018, "grad_norm": 0.12345180784953408, "learning_rate": 4.690362124841483e-06, "loss": 1.3534, "mean_token_accuracy": 0.6842653691768646, "num_tokens": 121950665.0, "step": 1320},
    {"entropy": 1.334375, "epoch": 0.1817684843515102, "grad_norm": 0.2280625631130464, "learning_rate": 4.686839509651966e-06, "loss": 1.3442, "mean_token_accuracy": 0.6831065654754639, "num_tokens": 122875209.0, "step": 1330},
    {"entropy": 1.33046875, "epoch": 0.18313516468498017, "grad_norm": 0.11812737856678532, "learning_rate": 4.683316894462449e-06, "loss": 1.329, "mean_token_accuracy": 0.6846230745315551, "num_tokens": 123817600.0, "step": 1340},
    {"entropy": 1.30625, "epoch": 0.18450184501845018, "grad_norm": 0.11090395848334941, "learning_rate": 4.679794279272933e-06, "loss": 1.3209, "mean_token_accuracy": 0.6878692448139191, "num_tokens": 124741536.0, "step": 1350},
    {"entropy": 1.32421875, "epoch": 0.1858685253519202, "grad_norm": 0.12775788543204933, "learning_rate": 4.676271664083416e-06, "loss": 1.3377, "mean_token_accuracy": 0.6858314096927642, "num_tokens": 125689703.0, "step": 1360},
    {"entropy": 1.38984375, "epoch": 0.18723520568539018, "grad_norm": 0.7082128816943113, "learning_rate": 4.6727490488938995e-06, "loss": 1.3918, "mean_token_accuracy": 0.6760635197162628, "num_tokens": 126565228.0, "step": 1370},
    {"entropy": 1.31875, "epoch": 0.1886018860188602, "grad_norm": 0.12659400134478793, "learning_rate": 4.669226433704382e-06, "loss": 1.3167, "mean_token_accuracy": 0.6889874815940857, "num_tokens": 127484119.0, "step": 1380},
    {"entropy": 1.296875, "epoch": 0.1899685663523302, "grad_norm": 0.12228348593583398, "learning_rate": 4.665703818514866e-06, "loss": 1.3078, "mean_token_accuracy": 0.6924630641937256, "num_tokens": 128382056.0, "step": 1390},
    {"entropy": 1.29921875, "epoch": 0.19133524668580018, "grad_norm": 0.12142231773421094, "learning_rate": 4.6621812033253484e-06, "loss": 1.3016, "mean_token_accuracy": 0.6917517781257629, "num_tokens": 129302334.0, "step": 1400},
    {"entropy": 1.2171875, "epoch": 0.1927019270192702, "grad_norm": 0.18646381158516584, "learning_rate": 4.658658588135833e-06, "loss": 1.2195, "mean_token_accuracy": 0.7072098851203918, "num_tokens": 130215453.0, "step": 1410},
    {"entropy": 1.2875, "epoch": 0.1940686073527402, "grad_norm": 0.13247480136147055, "learning_rate": 4.6551359729463155e-06, "loss": 1.2738, "mean_token_accuracy": 0.6982071995735168, "num_tokens": 131191399.0, "step": 1420},
    {"entropy": 1.35, "epoch": 0.19543528768621019, "grad_norm": 0.12050205689995584, "learning_rate": 4.651613357756799e-06, "loss": 1.3619, "mean_token_accuracy": 0.6786090016365052, "num_tokens": 132122523.0, "step": 1430},
    {"entropy": 1.35546875, "epoch": 0.1968019680196802, "grad_norm": 0.14326940963118967, "learning_rate": 4.648090742567283e-06, "loss": 1.3771, "mean_token_accuracy": 0.6800052225589752, "num_tokens": 133055902.0, "step": 1440},
    {"entropy": 1.3390625, "epoch": 0.1981686483531502, "grad_norm": 0.12484597455577315, "learning_rate": 4.644568127377765e-06, "loss": 1.3431, "mean_token_accuracy": 0.6844324350357056, "num_tokens": 134007365.0, "step": 1450},
    {"entropy": 1.33515625, "epoch": 0.1995353286866202, "grad_norm": 0.11434280173128462, "learning_rate": 4.641045512188249e-06, "loss": 1.3423, "mean_token_accuracy": 0.6854045927524567, "num_tokens": 134939937.0, "step": 1460},
    {"entropy": 1.2921875, "epoch": 0.2009020090200902, "grad_norm": 0.13022998625627383, "learning_rate": 4.6375228969987324e-06, "loss": 1.302, "mean_token_accuracy": 0.6887432277202606, "num_tokens": 135883157.0, "step": 1470},
    {"entropy": 1.33046875, "epoch": 0.2022686893535602, "grad_norm": 0.13801851842194432, "learning_rate": 4.634000281809216e-06, "loss": 1.3328, "mean_token_accuracy": 0.6865583240985871, "num_tokens": 136826502.0, "step": 1480},
    {"entropy": 1.309375, "epoch": 0.2036353696870302, "grad_norm": 0.1403373816950752, "learning_rate": 4.630477666619699e-06, "loss": 1.3047, "mean_token_accuracy": 0.6914615392684936, "num_tokens": 137729172.0, "step": 1490},
    {"entropy": 1.26640625, "epoch": 0.2050020500205002, "grad_norm": 0.12590330828751753, "learning_rate": 4.626955051430182e-06, "loss": 1.2692, "mean_token_accuracy": 0.7010594725608825, "num_tokens": 138673916.0, "step": 1500},
    {"entropy": 1.33359375, "epoch": 0.20636873035397021, "grad_norm": 0.19760019314069127, "learning_rate": 4.623432436240665e-06, "loss": 1.3345, "mean_token_accuracy": 0.6868266880512237, "num_tokens": 139533756.0, "step": 1510},
    {"entropy": 1.28515625, "epoch": 0.2077354106874402, "grad_norm": 0.12909608214725019, "learning_rate": 4.619909821051149e-06, "loss": 1.289, "mean_token_accuracy": 0.6937914311885833, "num_tokens": 140388113.0, "step": 1520},
    {"entropy": 1.31171875, "epoch": 0.2091020910209102, "grad_norm": 0.12878351348173622, "learning_rate": 4.616387205861632e-06, "loss": 1.3197, "mean_token_accuracy": 0.6870058596134185, "num_tokens": 141300644.0, "step": 1530},
    {"entropy": 1.3109375, "epoch": 0.21046877135438022, "grad_norm": 0.1296951138547516, "learning_rate": 4.6128645906721156e-06, "loss": 1.3127, "mean_token_accuracy": 0.6906398415565491, "num_tokens": 142178254.0, "step": 1540},
    {"entropy": 1.3625, "epoch": 0.2118354516878502, "grad_norm": 0.14747847604373324, "learning_rate": 4.609341975482598e-06, "loss": 1.3717, "mean_token_accuracy": 0.6806895673274994, "num_tokens": 143088982.0, "step": 1550},
    {"entropy": 1.3140625, "epoch": 0.2132021320213202, "grad_norm": 0.12372767579199202, "learning_rate": 4.605819360293082e-06, "loss": 1.3148, "mean_token_accuracy": 0.6898377060890197, "num_tokens": 144044472.0, "step": 1560},
    {"entropy": 1.26953125, "epoch": 0.21456881235479022, "grad_norm": 0.10901183388662893, "learning_rate": 4.602296745103565e-06, "loss": 1.2747, "mean_token_accuracy": 0.6975809335708618, "num_tokens": 144956757.0, "step": 1570},
    {"entropy": 1.32890625, "epoch": 0.2159354926882602, "grad_norm": 0.12781878640132094, "learning_rate": 4.598774129914049e-06, "loss": 1.3589, "mean_token_accuracy": 0.6819869220256806, "num_tokens": 145879897.0, "step": 1580},
    {"entropy": 1.3328125, "epoch": 0.21730217302173022, "grad_norm": 0.1324933715112407, "learning_rate": 4.595251514724532e-06, "loss": 1.3215, "mean_token_accuracy": 0.6851811587810517, "num_tokens": 146783735.0, "step": 1590},
    {"entropy": 1.30546875, "epoch": 0.21866885335520023, "grad_norm": 0.11189031978182414, "learning_rate": 4.591728899535015e-06, "loss": 1.2939, "mean_token_accuracy": 0.6920007169246674, "num_tokens": 147702553.0, "step": 1600},
    {"entropy": 1.2625, "epoch": 0.2200355336886702, "grad_norm": 0.13321003131742193, "learning_rate": 4.588206284345499e-06, "loss": 1.268, "mean_token_accuracy": 0.6966476440429688, "num_tokens": 148628418.0, "step": 1610},
    {"entropy": 1.38671875, "epoch": 0.22140221402214022, "grad_norm": 0.1250615719937732, "learning_rate": 4.584683669155981e-06, "loss": 1.397, "mean_token_accuracy": 0.6745240449905395, "num_tokens": 149568000.0, "step": 1620},
    {"entropy": 1.3515625, "epoch": 0.22276889435561023, "grad_norm": 0.14213937674136803, "learning_rate": 4.581161053966465e-06, "loss": 1.3547, "mean_token_accuracy": 0.6849486649036407, "num_tokens": 150546517.0, "step": 1630},
    {"entropy": 1.34921875, "epoch": 0.2241355746890802, "grad_norm": 0.13041928358014948, "learning_rate": 4.5776384387769485e-06, "loss": 1.3681, "mean_token_accuracy": 0.6824446976184845, "num_tokens": 151479506.0, "step": 1640},
    {"entropy": 1.2984375, "epoch": 0.22550225502255022, "grad_norm": 0.13822883890567075, "learning_rate": 4.574115823587432e-06, "loss": 1.2942, "mean_token_accuracy": 0.6945872664451599, "num_tokens": 152405951.0, "step": 1650},
    {"entropy": 1.29453125, "epoch": 0.22686893535602023, "grad_norm": 0.12123098442839231, "learning_rate": 4.570593208397915e-06, "loss": 1.3099, "mean_token_accuracy": 0.6914681673049927, "num_tokens": 153325504.0, "step": 1660},
    {"entropy": 1.3203125, "epoch": 0.22823561568949022, "grad_norm": 0.11279177745682388, "learning_rate": 4.567070593208398e-06, "loss": 1.3169, "mean_token_accuracy": 0.6899838626384736, "num_tokens": 154226518.0, "step": 1670},
    {"entropy": 1.303125, "epoch": 0.22960229602296023, "grad_norm": 0.1304003411681263, "learning_rate": 4.563547978018882e-06, "loss": 1.3076, "mean_token_accuracy": 0.6897183239459992, "num_tokens": 155166706.0, "step": 1680},
    {"entropy": 1.2875, "epoch": 0.23096897635643024, "grad_norm": 0.10940387981628738, "learning_rate": 4.560025362829365e-06, "loss": 1.2895, "mean_token_accuracy": 0.6918056666851043, "num_tokens": 156080529.0, "step": 1690},
    {"entropy": 1.2984375, "epoch": 0.23233565668990022, "grad_norm": 0.11276199181600362, "learning_rate": 4.556502747639848e-06, "loss": 1.3128, "mean_token_accuracy": 0.6900627732276916, "num_tokens": 157016731.0, "step": 1700},
    {"entropy": 1.33515625, "epoch": 0.23370233702337023, "grad_norm": 0.1123205426538198, "learning_rate": 4.552980132450332e-06, "loss": 1.3349, "mean_token_accuracy": 0.6838991045951843, "num_tokens": 157986519.0, "step": 1710},
    {"entropy": 1.29609375, "epoch": 0.23506901735684024, "grad_norm": 0.13710347734693734, "learning_rate": 4.549457517260814e-06, "loss": 1.3048, "mean_token_accuracy": 0.6920367002487182, "num_tokens": 158934641.0, "step": 1720},
    {"entropy": 1.3015625, "epoch": 0.23643569769031023, "grad_norm": 0.12146949659894289, "learning_rate": 4.545934902071298e-06, "loss": 1.3161, "mean_token_accuracy": 0.6900604009628296, "num_tokens": 159912974.0, "step": 1730},
    {"entropy": 1.340625, "epoch": 0.23780237802378024, "grad_norm": 0.13040551198856168, "learning_rate": 4.5424122868817814e-06, "loss": 1.3481, "mean_token_accuracy": 0.6832473576068878, "num_tokens": 160803757.0, "step": 1740},
    {"entropy": 1.2796875, "epoch": 0.23916905835725025, "grad_norm": 0.1193659375558908, "learning_rate": 4.538889671692265e-06, "loss": 1.2839, "mean_token_accuracy": 0.6983261704444885, "num_tokens": 161701172.0, "step": 1750},
    {"entropy": 1.3234375, "epoch": 0.24053573869072023, "grad_norm": 0.12158189496094203, "learning_rate": 4.535367056502748e-06, "loss": 1.3275, "mean_token_accuracy": 0.6872871041297912, "num_tokens": 162650956.0, "step": 1760},
    {"entropy": 1.31953125, "epoch": 0.24190241902419024, "grad_norm": 0.1161624064784002, "learning_rate": 4.531844441313231e-06, "loss": 1.3222, "mean_token_accuracy": 0.688202953338623, "num_tokens": 163571702.0, "step": 1770},
    {"entropy": 1.25859375, "epoch": 0.24326909935766025, "grad_norm": 0.10615894678931836, "learning_rate": 4.528321826123715e-06, "loss": 1.2666, "mean_token_accuracy": 0.6976981639862061, "num_tokens": 164471607.0, "step": 1780},
    {"entropy": 1.31015625, "epoch": 0.24463577969113023, "grad_norm": 0.10169806066888307, "learning_rate": 4.524799210934198e-06, "loss": 1.3149, "mean_token_accuracy": 0.689883029460907, "num_tokens": 165399554.0, "step": 1790},
    {"entropy": 1.3125, "epoch": 0.24600246002460024, "grad_norm": 0.10783323457551597, "learning_rate": 4.521276595744681e-06, "loss": 1.3092, "mean_token_accuracy": 0.6923280239105225, "num_tokens": 166291849.0, "step": 1800},
    {"entropy": 1.30234375, "epoch": 0.24736914035807026, "grad_norm": 0.17705820594974317, "learning_rate": 4.5177539805551646e-06, "loss": 1.3168, "mean_token_accuracy": 0.6892471313476562, "num_tokens": 167206313.0, "step": 1810},
    {"entropy": 1.2921875, "epoch": 0.24873582069154024, "grad_norm": 0.1145910223404135, "learning_rate": 4.514231365365648e-06, "loss": 1.3014, "mean_token_accuracy": 0.692210179567337, "num_tokens": 168088688.0, "step": 1820},
    {"entropy": 1.2890625, "epoch": 0.25010250102501025, "grad_norm": 0.14365066598597562, "learning_rate": 4.510708750176131e-06, "loss": 1.2802, "mean_token_accuracy": 0.6950039148330689, "num_tokens": 168991287.0, "step": 1830},
    {"entropy": 1.284375, "epoch": 0.25146918135848023, "grad_norm": 0.13098173972078386, "learning_rate": 4.507186134986614e-06, "loss": 1.2806, "mean_token_accuracy": 0.6964347302913666, "num_tokens": 169961051.0, "step": 1840},
    {"entropy": 1.30703125, "epoch": 0.25283586169195027, "grad_norm": 0.1274081116776796, "learning_rate": 4.503663519797098e-06, "loss": 1.3166, "mean_token_accuracy": 0.6880217254161834, "num_tokens": 170889299.0, "step": 1850},
    {"entropy": 1.2796875, "epoch": 0.25420254202542025, "grad_norm": 0.13593767714834198, "learning_rate": 4.5001409046075814e-06, "loss": 1.2862, "mean_token_accuracy": 0.696414464712143, "num_tokens": 171801507.0, "step": 1860},
    {"entropy": 1.28828125, "epoch": 0.25556922235889024, "grad_norm": 0.11771930780473085, "learning_rate": 4.496618289418064e-06, "loss": 1.2844, "mean_token_accuracy": 0.6945802927017212, "num_tokens": 172756808.0, "step": 1870},
    {"entropy": 1.27578125, "epoch": 0.2569359026923603, "grad_norm": 0.13197725143149885, "learning_rate": 4.493095674228548e-06, "loss": 1.2805, "mean_token_accuracy": 0.6968704640865326, "num_tokens": 173650984.0, "step": 1880},
    {"entropy": 1.346875, "epoch": 0.25830258302583026, "grad_norm": 0.11860302640553821, "learning_rate": 4.48957305903903e-06, "loss": 1.3439, "mean_token_accuracy": 0.6844312667846679, "num_tokens": 174590986.0, "step": 1890},
    {"entropy": 1.3, "epoch": 0.25966926335930024, "grad_norm": 0.11211263436916198, "learning_rate": 4.486050443849515e-06, "loss": 1.3059, "mean_token_accuracy": 0.6933001279830933, "num_tokens": 175502905.0, "step": 1900},
    {"entropy": 1.28203125, "epoch": 0.2610359436927703, "grad_norm": 0.11177367369502672, "learning_rate": 4.4825278286599975e-06, "loss": 1.2905, "mean_token_accuracy": 0.6938953459262848, "num_tokens": 176434918.0, "step": 1910},
    {"entropy": 1.265625, "epoch": 0.26240262402624026, "grad_norm": 0.13352315044656032, "learning_rate": 4.479005213470481e-06, "loss": 1.2706, "mean_token_accuracy": 0.6995628952980042, "num_tokens": 177351387.0, "step": 1920},
    {"entropy": 1.30546875, "epoch": 0.26376930435971024, "grad_norm": 0.12808629840270644, "learning_rate": 4.475482598280964e-06, "loss": 1.3019, "mean_token_accuracy": 0.6890412092208862, "num_tokens": 178273072.0, "step": 1930},
    {"entropy": 1.2765625, "epoch": 0.2651359846931803, "grad_norm": 0.10792351264063485, "learning_rate": 4.471959983091447e-06, "loss": 1.2788, "mean_token_accuracy": 0.6965969800949097, "num_tokens": 179201344.0, "step": 1940},
    {"entropy": 1.2984375, "epoch": 0.26650266502665027, "grad_norm": 0.11742711013648705, "learning_rate": 4.468437367901931e-06, "loss": 1.3105, "mean_token_accuracy": 0.6918510138988495, "num_tokens": 180128488.0, "step": 1950},
    {"entropy": 1.33046875, "epoch": 0.26786934536012025, "grad_norm": 0.12618018874948397, "learning_rate": 4.464914752712414e-06, "loss": 1.3217, "mean_token_accuracy": 0.6894722759723664, "num_tokens": 181044049.0, "step": 1960},
    {"entropy": 1.2703125, "epoch": 0.2692360256935903, "grad_norm": 0.13078351398568738, "learning_rate": 4.461392137522897e-06, "loss": 1.2776, "mean_token_accuracy": 0.695350193977356, "num_tokens": 181988674.0, "step": 1970},
    {"entropy": 1.2625, "epoch": 0.27060270602706027, "grad_norm": 0.13722698118502852, "learning_rate": 4.457869522333381e-06, "loss": 1.2592, "mean_token_accuracy": 0.6998622715473175, "num_tokens": 182885151.0, "step": 1980},
    {"entropy": 1.284375, "epoch": 0.27196938636053025, "grad_norm": 0.1281120333564413, "learning_rate": 4.454346907143864e-06, "loss": 1.2918, "mean_token_accuracy": 0.6966606914997101, "num_tokens": 183799094.0, "step": 1990},
    {"entropy": 1.32109375, "epoch": 0.2733360666940003, "grad_norm": 0.14807743689938463, "learning_rate": 4.450824291954347e-06, "loss": 1.3284, "mean_token_accuracy": 0.6876100718975067, "num_tokens": 184737383.0, "step": 2000},
    {"entropy": 1.25546875, "epoch": 0.2747027470274703, "grad_norm": 0.12358712177283616, "learning_rate": 4.44730167676483e-06, "loss": 1.2691, "mean_token_accuracy": 0.6997144818305969, "num_tokens": 185666325.0, "step": 2010},
    {"entropy": 1.334375, "epoch": 0.27606942736094026, "grad_norm": 0.12356540936282172, "learning_rate": 4.443779061575314e-06, "loss": 1.3403, "mean_token_accuracy": 0.6832573473453522, "num_tokens": 186596675.0, "step": 2020},
    {"entropy": 1.2375, "epoch": 0.2774361076944103, "grad_norm": 0.13658668993910775, "learning_rate": 4.4402564463857975e-06, "loss": 1.2379, "mean_token_accuracy": 0.7007066369056701, "num_tokens": 187482907.0, "step": 2030},
    {"entropy": 1.29765625, "epoch": 0.2788027880278803, "grad_norm": 0.12650305045179955, "learning_rate": 4.43673383119628e-06, "loss": 1.3265, "mean_token_accuracy": 0.686937290430069, "num_tokens": 188385774.0, "step": 2040},
    {"entropy": 1.2984375, "epoch": 0.28016946836135026, "grad_norm": 0.14676450805947974, "learning_rate": 4.433211216006764e-06, "loss": 1.3116, "mean_token_accuracy": 0.6900434792041779, "num_tokens": 189277151.0, "step": 2050},
    {"entropy": 1.25625, "epoch": 0.2815361486948203, "grad_norm": 0.12703639124735772, "learning_rate": 4.4296886008172465e-06, "loss": 1.273, "mean_token_accuracy": 0.6957660675048828, "num_tokens": 190184956.0, "step": 2060},
    {"entropy": 1.2734375, "epoch": 0.2829028290282903, "grad_norm": 0.11918830070940434, "learning_rate": 4.426165985627731e-06, "loss": 1.2818, "mean_token_accuracy": 0.6965542018413544, "num_tokens": 191146194.0, "step": 2070},
    {"entropy": 1.2546875, "epoch": 0.28426950936176026, "grad_norm": 0.12346494355358226, "learning_rate": 4.4226433704382136e-06, "loss": 1.2557, "mean_token_accuracy": 0.7016662359237671, "num_tokens": 192033840.0, "step": 2080},
    {"entropy": 1.2703125, "epoch": 0.2856361896952303, "grad_norm": 0.14567003630646855, "learning_rate": 4.419120755248697e-06, "loss": 1.2762, "mean_token_accuracy": 0.6963268220424652, "num_tokens": 192955176.0, "step": 2090},
    {"entropy": 1.3390625, "epoch": 0.2870028700287003, "grad_norm": 0.11834226531009458, "learning_rate": 4.41559814005918e-06, "loss": 1.3613, "mean_token_accuracy": 0.6816338956356048, "num_tokens": 193892359.0, "step": 2100},
    {"entropy": 1.27421875, "epoch": 0.28836955036217027, "grad_norm": 0.21025543104878225, "learning_rate": 4.412075524869663e-06, "loss": 1.2748, "mean_token_accuracy": 0.6982753038406372, "num_tokens": 194764643.0, "step": 2110},
    {"entropy": 1.2890625, "epoch": 0.2897362306956403, "grad_norm": 0.11298909710540267, "learning_rate": 4.408552909680147e-06, "loss": 1.2931, "mean_token_accuracy": 0.695224529504776, "num_tokens": 195672996.0, "step": 2120},
    {"entropy": 1.2984375, "epoch": 0.2911029110291103, "grad_norm": 0.1160780347894177, "learning_rate": 4.4050302944906304e-06, "loss": 1.3083, "mean_token_accuracy": 0.6908288896083832, "num_tokens": 196608934.0, "step": 2130},
    {"entropy": 1.29140625, "epoch": 0.2924695913625803, "grad_norm": 0.11677578834103958, "learning_rate": 4.401507679301113e-06, "loss": 1.2878, "mean_token_accuracy": 0.6932227075099945, "num_tokens": 197498404.0, "step": 2140},
    {"entropy": 1.34453125, "epoch": 0.2938362716960503, "grad_norm": 0.13114900751469802, "learning_rate": 4.397985064111597e-06, "loss": 1.3585, "mean_token_accuracy": 0.6822027564048767, "num_tokens": 198454218.0, "step": 2150},
    {"entropy": 1.29140625, "epoch": 0.2952029520295203, "grad_norm": 0.160853699748396, "learning_rate": 4.39446244892208e-06, "loss": 1.2907, "mean_token_accuracy": 0.693654203414917, "num_tokens": 199374348.0, "step": 2160},
    {"entropy": 1.30390625, "epoch": 0.2965696323629903, "grad_norm": 0.12308220059919121, "learning_rate": 4.390939833732563e-06, "loss": 1.3093, "mean_token_accuracy": 0.6904061734676361, "num_tokens": 200221659.0, "step": 2170},
    {"entropy": 1.28828125, "epoch": 0.2979363126964603, "grad_norm": 0.13551784163794225, "learning_rate": 4.387417218543047e-06, "loss": 1.2955, "mean_token_accuracy": 0.6929843306541443, "num_tokens": 201155187.0, "step": 2180},
    {"entropy": 1.24921875, "epoch": 0.2993029930299303, "grad_norm": 0.13296809882994823, "learning_rate": 4.38389460335353e-06, "loss": 1.2392, "mean_token_accuracy": 0.703128844499588, "num_tokens": 202067534.0, "step": 2190},
    {"entropy": 1.2984375, "epoch": 0.3006696733634003, "grad_norm": 0.1106639036297065, "learning_rate": 4.380371988164014e-06, "loss": 1.3068, "mean_token_accuracy": 0.6892962515354156, "num_tokens": 203044126.0, "step": 2200},
    {"entropy": 1.278125, "epoch": 0.3020363536968703, "grad_norm": 0.129528231726814, "learning_rate": 4.376849372974496e-06, "loss": 1.2895, "mean_token_accuracy": 0.696237176656723, "num_tokens": 203996905.0, "step": 2210},
    {"entropy": 1.25703125, "epoch": 0.3034030340303403, "grad_norm": 0.11656862085456472, "learning_rate": 4.37332675778498e-06, "loss": 1.2596, "mean_token_accuracy": 0.6979257345199585, "num_tokens": 204929875.0, "step": 2220},
    {"entropy": 1.2625, "epoch": 0.3047697143638103, "grad_norm": 0.1108721570089439, "learning_rate": 4.369804142595463e-06, "loss": 1.2776, "mean_token_accuracy": 0.697523432970047, "num_tokens": 205839908.0, "step": 2230},
    {"entropy": 1.28359375, "epoch": 0.3061363946972803, "grad_norm": 0.11245347079609937, "learning_rate": 4.366281527405947e-06, "loss": 1.2913, "mean_token_accuracy": 0.6937489032745361,
|
"num_tokens": 206800232.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.3075030750307503,
|
|
"grad_norm": 0.16307737681670748,
|
|
"learning_rate": 4.36275891221643e-06,
|
|
"loss": 1.2345,
|
|
"mean_token_accuracy": 0.7008326530456543,
|
|
"num_tokens": 207736757.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.3088697553642203,
|
|
"grad_norm": 0.1173917685136567,
|
|
"learning_rate": 4.359236297026913e-06,
|
|
"loss": 1.2602,
|
|
"mean_token_accuracy": 0.6997781217098236,
|
|
"num_tokens": 208650386.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 0.3102364356976903,
|
|
"grad_norm": 0.13744884320738784,
|
|
"learning_rate": 4.355713681837396e-06,
|
|
"loss": 1.2441,
|
|
"mean_token_accuracy": 0.7013028502464295,
|
|
"num_tokens": 209561366.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.3116031160311603,
|
|
"grad_norm": 0.10752187831601055,
|
|
"learning_rate": 4.352191066647879e-06,
|
|
"loss": 1.2775,
|
|
"mean_token_accuracy": 0.6958353698253632,
|
|
"num_tokens": 210471900.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.3129697963646303,
|
|
"grad_norm": 0.12687113696309119,
|
|
"learning_rate": 4.348668451458363e-06,
|
|
"loss": 1.2632,
|
|
"mean_token_accuracy": 0.7001063466072083,
|
|
"num_tokens": 211400549.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 1.3203125,
|
|
"epoch": 0.31433647669810033,
|
|
"grad_norm": 0.1308350784071019,
|
|
"learning_rate": 4.3451458362688465e-06,
|
|
"loss": 1.3221,
|
|
"mean_token_accuracy": 0.6884016871452332,
|
|
"num_tokens": 212322958.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 1.2796875,
|
|
"epoch": 0.3157031570315703,
|
|
"grad_norm": 0.11560892565941425,
|
|
"learning_rate": 4.34162322107933e-06,
|
|
"loss": 1.2747,
|
|
"mean_token_accuracy": 0.6982922971248626,
|
|
"num_tokens": 213218481.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 1.29140625,
|
|
"epoch": 0.3170698373650403,
|
|
"grad_norm": 0.11588237904650757,
|
|
"learning_rate": 4.338100605889813e-06,
|
|
"loss": 1.2889,
|
|
"mean_token_accuracy": 0.6951510965824127,
|
|
"num_tokens": 214104458.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 0.31843651769851034,
|
|
"grad_norm": 0.12456513167586997,
|
|
"learning_rate": 4.334577990700296e-06,
|
|
"loss": 1.2256,
|
|
"mean_token_accuracy": 0.7081618905067444,
|
|
"num_tokens": 214997474.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 1.29453125,
|
|
"epoch": 0.3198031980319803,
|
|
"grad_norm": 0.1393854894008662,
|
|
"learning_rate": 4.33105537551078e-06,
|
|
"loss": 1.3058,
|
|
"mean_token_accuracy": 0.6891490280628204,
|
|
"num_tokens": 215898945.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.3211698783654503,
|
|
"grad_norm": 0.12110931639737384,
|
|
"learning_rate": 4.327532760321263e-06,
|
|
"loss": 1.2442,
|
|
"mean_token_accuracy": 0.704649168252945,
|
|
"num_tokens": 216780044.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.32253655869892034,
|
|
"grad_norm": 0.13037133307716994,
|
|
"learning_rate": 4.324010145131746e-06,
|
|
"loss": 1.2616,
|
|
"mean_token_accuracy": 0.6965738534927368,
|
|
"num_tokens": 217686514.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 1.28046875,
|
|
"epoch": 0.3239032390323903,
|
|
"grad_norm": 0.1102080909369107,
|
|
"learning_rate": 4.32048752994223e-06,
|
|
"loss": 1.2971,
|
|
"mean_token_accuracy": 0.6929846584796906,
|
|
"num_tokens": 218646157.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 1.290625,
|
|
"epoch": 0.3252699193658603,
|
|
"grad_norm": 0.11310121415863894,
|
|
"learning_rate": 4.316964914752712e-06,
|
|
"loss": 1.2983,
|
|
"mean_token_accuracy": 0.6945104002952576,
|
|
"num_tokens": 219565771.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 1.290625,
|
|
"epoch": 0.32663659969933034,
|
|
"grad_norm": 0.13054145268175907,
|
|
"learning_rate": 4.313442299563196e-06,
|
|
"loss": 1.2971,
|
|
"mean_token_accuracy": 0.6952828586101532,
|
|
"num_tokens": 220446709.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.3280032800328003,
|
|
"grad_norm": 0.13307442953470408,
|
|
"learning_rate": 4.3099196843736794e-06,
|
|
"loss": 1.2748,
|
|
"mean_token_accuracy": 0.6965203583240509,
|
|
"num_tokens": 221335909.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 1.2765625,
|
|
"epoch": 0.3293699603662703,
|
|
"grad_norm": 0.11046789508673516,
|
|
"learning_rate": 4.306397069184163e-06,
|
|
"loss": 1.2694,
|
|
"mean_token_accuracy": 0.6959333658218384,
|
|
"num_tokens": 222251385.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 1.29921875,
|
|
"epoch": 0.33073664069974035,
|
|
"grad_norm": 0.1300628713443655,
|
|
"learning_rate": 4.302874453994646e-06,
|
|
"loss": 1.3041,
|
|
"mean_token_accuracy": 0.6904434621334076,
|
|
"num_tokens": 223160354.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 1.25625,
|
|
"epoch": 0.33210332103321033,
|
|
"grad_norm": 0.1103830865159034,
|
|
"learning_rate": 4.299351838805129e-06,
|
|
"loss": 1.2676,
|
|
"mean_token_accuracy": 0.6983302354812622,
|
|
"num_tokens": 224079742.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 1.27890625,
|
|
"epoch": 0.3334700013666803,
|
|
"grad_norm": 0.11476649919545955,
|
|
"learning_rate": 4.295829223615613e-06,
|
|
"loss": 1.2876,
|
|
"mean_token_accuracy": 0.6952979445457459,
|
|
"num_tokens": 225006893.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 1.3125,
|
|
"epoch": 0.33483668170015035,
|
|
"grad_norm": 0.11646686360926647,
|
|
"learning_rate": 4.292306608426096e-06,
|
|
"loss": 1.3002,
|
|
"mean_token_accuracy": 0.6930657207965851,
|
|
"num_tokens": 225912700.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.33620336203362033,
|
|
"grad_norm": 0.12280188297544078,
|
|
"learning_rate": 4.288783993236579e-06,
|
|
"loss": 1.231,
|
|
"mean_token_accuracy": 0.7040263235569,
|
|
"num_tokens": 226846501.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 1.28359375,
|
|
"epoch": 0.3375700423670903,
|
|
"grad_norm": 0.11533769838187642,
|
|
"learning_rate": 4.285261378047063e-06,
|
|
"loss": 1.2812,
|
|
"mean_token_accuracy": 0.6964476704597473,
|
|
"num_tokens": 227775064.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 1.2859375,
|
|
"epoch": 0.33893672270056036,
|
|
"grad_norm": 0.12050359315526839,
|
|
"learning_rate": 4.281738762857546e-06,
|
|
"loss": 1.2977,
|
|
"mean_token_accuracy": 0.6898184597492218,
|
|
"num_tokens": 228705788.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 1.32421875,
|
|
"epoch": 0.34030340303403034,
|
|
"grad_norm": 0.13121238377045613,
|
|
"learning_rate": 4.278216147668029e-06,
|
|
"loss": 1.3273,
|
|
"mean_token_accuracy": 0.6868832111358643,
|
|
"num_tokens": 229637091.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 1.26484375,
|
|
"epoch": 0.3416700833675003,
|
|
"grad_norm": 0.12421725675594202,
|
|
"learning_rate": 4.274693532478512e-06,
|
|
"loss": 1.2732,
|
|
"mean_token_accuracy": 0.6960365235805511,
|
|
"num_tokens": 230557197.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 1.23984375,
|
|
"epoch": 0.34303676370097036,
|
|
"grad_norm": 0.10690843923579096,
|
|
"learning_rate": 4.271170917288996e-06,
|
|
"loss": 1.2378,
|
|
"mean_token_accuracy": 0.7052179098129272,
|
|
"num_tokens": 231469955.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 1.25546875,
|
|
"epoch": 0.34440344403444034,
|
|
"grad_norm": 0.11459973358390473,
|
|
"learning_rate": 4.2676483020994795e-06,
|
|
"loss": 1.2622,
|
|
"mean_token_accuracy": 0.7000189423561096,
|
|
"num_tokens": 232397901.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 1.31171875,
|
|
"epoch": 0.3457701243679103,
|
|
"grad_norm": 0.1148522143088296,
|
|
"learning_rate": 4.264125686909962e-06,
|
|
"loss": 1.3286,
|
|
"mean_token_accuracy": 0.6882340133190155,
|
|
"num_tokens": 233361773.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 1.278125,
|
|
"epoch": 0.34713680470138036,
|
|
"grad_norm": 0.12369351514476203,
|
|
"learning_rate": 4.260603071720446e-06,
|
|
"loss": 1.2899,
|
|
"mean_token_accuracy": 0.6957898378372193,
|
|
"num_tokens": 234268527.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 1.28515625,
|
|
"epoch": 0.34850348503485035,
|
|
"grad_norm": 0.12889801149537766,
|
|
"learning_rate": 4.257080456530928e-06,
|
|
"loss": 1.2905,
|
|
"mean_token_accuracy": 0.6944907069206238,
|
|
"num_tokens": 235186978.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.34987016536832033,
|
|
"grad_norm": 0.11243462446932824,
|
|
"learning_rate": 4.253557841341413e-06,
|
|
"loss": 1.2236,
|
|
"mean_token_accuracy": 0.7048734545707702,
|
|
"num_tokens": 236119762.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 1.27265625,
|
|
"epoch": 0.35123684570179037,
|
|
"grad_norm": 0.19039497288242777,
|
|
"learning_rate": 4.2500352261518955e-06,
|
|
"loss": 1.2707,
|
|
"mean_token_accuracy": 0.6961425244808197,
|
|
"num_tokens": 237045069.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 1.21328125,
|
|
"epoch": 0.35260352603526035,
|
|
"grad_norm": 0.12164270264953554,
|
|
"learning_rate": 4.246512610962379e-06,
|
|
"loss": 1.2158,
|
|
"mean_token_accuracy": 0.7082364320755005,
|
|
"num_tokens": 237966003.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.35397020636873033,
|
|
"grad_norm": 0.11681577863039247,
|
|
"learning_rate": 4.242989995772862e-06,
|
|
"loss": 1.2664,
|
|
"mean_token_accuracy": 0.6984503924846649,
|
|
"num_tokens": 238847792.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 1.296875,
|
|
"epoch": 0.35533688670220037,
|
|
"grad_norm": 0.11173877373186784,
|
|
"learning_rate": 4.239467380583345e-06,
|
|
"loss": 1.3077,
|
|
"mean_token_accuracy": 0.6906331598758697,
|
|
"num_tokens": 239817460.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.35670356703567035,
|
|
"grad_norm": 0.11339189951878077,
|
|
"learning_rate": 4.235944765393829e-06,
|
|
"loss": 1.2338,
|
|
"mean_token_accuracy": 0.703789210319519,
|
|
"num_tokens": 240730875.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 1.2765625,
|
|
"epoch": 0.35807024736914034,
|
|
"grad_norm": 0.3368006360903974,
|
|
"learning_rate": 4.232422150204312e-06,
|
|
"loss": 1.2894,
|
|
"mean_token_accuracy": 0.6908616781234741,
|
|
"num_tokens": 241618927.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 1.33046875,
|
|
"epoch": 0.3594369277026104,
|
|
"grad_norm": 0.11504662187471855,
|
|
"learning_rate": 4.228899535014795e-06,
|
|
"loss": 1.3354,
|
|
"mean_token_accuracy": 0.6852052867412567,
|
|
"num_tokens": 242506323.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.36080360803608036,
|
|
"grad_norm": 0.15077345098264827,
|
|
"learning_rate": 4.225376919825279e-06,
|
|
"loss": 1.2706,
|
|
"mean_token_accuracy": 0.6977900922298431,
|
|
"num_tokens": 243419790.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.36217028836955034,
|
|
"grad_norm": 0.12104153392246163,
|
|
"learning_rate": 4.221854304635762e-06,
|
|
"loss": 1.2589,
|
|
"mean_token_accuracy": 0.6975282967090607,
|
|
"num_tokens": 244337605.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 1.31484375,
|
|
"epoch": 0.3635369687030204,
|
|
"grad_norm": 0.11908128594658098,
|
|
"learning_rate": 4.218331689446245e-06,
|
|
"loss": 1.3308,
|
|
"mean_token_accuracy": 0.6857661306858063,
|
|
"num_tokens": 245269588.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.36490364903649036,
|
|
"grad_norm": 0.14196866150717374,
|
|
"learning_rate": 4.2148090742567284e-06,
|
|
"loss": 1.2365,
|
|
"mean_token_accuracy": 0.7050193846225739,
|
|
"num_tokens": 246183522.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.36627032936996035,
|
|
"grad_norm": 0.1125412918701911,
|
|
"learning_rate": 4.211286459067212e-06,
|
|
"loss": 1.2916,
|
|
"mean_token_accuracy": 0.6902585327625275,
|
|
"num_tokens": 247159185.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.3676370097034304,
|
|
"grad_norm": 0.1069411229900819,
|
|
"learning_rate": 4.2077638438776955e-06,
|
|
"loss": 1.2124,
|
|
"mean_token_accuracy": 0.7089369416236877,
|
|
"num_tokens": 248044696.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 1.23359375,
|
|
"epoch": 0.36900369003690037,
|
|
"grad_norm": 0.12324321391962742,
|
|
"learning_rate": 4.204241228688178e-06,
|
|
"loss": 1.2452,
|
|
"mean_token_accuracy": 0.7030347645282745,
|
|
"num_tokens": 249045044.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.37037037037037035,
|
|
"grad_norm": 0.13881611733662494,
|
|
"learning_rate": 4.200718613498662e-06,
|
|
"loss": 1.2528,
|
|
"mean_token_accuracy": 0.7012747704982758,
|
|
"num_tokens": 249974276.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.3717370507038404,
|
|
"grad_norm": 0.13051151234951344,
|
|
"learning_rate": 4.1971959983091445e-06,
|
|
"loss": 1.2619,
|
|
"mean_token_accuracy": 0.7001829504966736,
|
|
"num_tokens": 250873544.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.37310373103731037,
|
|
"grad_norm": 0.12004311285603303,
|
|
"learning_rate": 4.193673383119629e-06,
|
|
"loss": 1.2303,
|
|
"mean_token_accuracy": 0.7044033169746399,
|
|
"num_tokens": 251819879.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 1.33671875,
|
|
"epoch": 0.37447041137078035,
|
|
"grad_norm": 0.11020555779222778,
|
|
"learning_rate": 4.190150767930112e-06,
|
|
"loss": 1.3409,
|
|
"mean_token_accuracy": 0.6853121519088745,
|
|
"num_tokens": 252807631.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 1.31875,
|
|
"epoch": 0.3758370917042504,
|
|
"grad_norm": 0.11148832446302709,
|
|
"learning_rate": 4.186628152740595e-06,
|
|
"loss": 1.3246,
|
|
"mean_token_accuracy": 0.6881563842296601,
|
|
"num_tokens": 253747908.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.3772037720377204,
|
|
"grad_norm": 0.11947725181236096,
|
|
"learning_rate": 4.183105537551078e-06,
|
|
"loss": 1.2549,
|
|
"mean_token_accuracy": 0.6997751951217651,
|
|
"num_tokens": 254680625.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 1.3015625,
|
|
"epoch": 0.37857045237119036,
|
|
"grad_norm": 0.11661561803888455,
|
|
"learning_rate": 4.179582922361561e-06,
|
|
"loss": 1.315,
|
|
"mean_token_accuracy": 0.687542873620987,
|
|
"num_tokens": 255573141.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 1.29375,
|
|
"epoch": 0.3799371327046604,
|
|
"grad_norm": 0.11348552853117055,
|
|
"learning_rate": 4.176060307172045e-06,
|
|
"loss": 1.3012,
|
|
"mean_token_accuracy": 0.6913858711719513,
|
|
"num_tokens": 256517622.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 1.2578125,
|
|
"epoch": 0.3813038130381304,
|
|
"grad_norm": 0.11361417147858806,
|
|
"learning_rate": 4.1725376919825285e-06,
|
|
"loss": 1.2589,
|
|
"mean_token_accuracy": 0.7002162039279938,
|
|
"num_tokens": 257406301.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 1.26015625,
|
|
"epoch": 0.38267049337160036,
|
|
"grad_norm": 0.1192304741579292,
|
|
"learning_rate": 4.169015076793011e-06,
|
|
"loss": 1.2603,
|
|
"mean_token_accuracy": 0.69959557056427,
|
|
"num_tokens": 258296600.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.3840371737050704,
|
|
"grad_norm": 0.13659441192455654,
|
|
"learning_rate": 4.165492461603495e-06,
|
|
"loss": 1.2303,
|
|
"mean_token_accuracy": 0.7036224126815795,
|
|
"num_tokens": 259223724.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.3854038540385404,
|
|
"grad_norm": 0.15087681652623133,
|
|
"learning_rate": 4.161969846413978e-06,
|
|
"loss": 1.2294,
|
|
"mean_token_accuracy": 0.7048193097114563,
|
|
"num_tokens": 260114709.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 0.38677053437201037,
|
|
"grad_norm": 0.11137116676263915,
|
|
"learning_rate": 4.158447231224461e-06,
|
|
"loss": 1.2026,
|
|
"mean_token_accuracy": 0.7096366822719574,
|
|
"num_tokens": 261095644.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 1.271875,
|
|
"epoch": 0.3881372147054804,
|
|
"grad_norm": 0.13660727391696498,
|
|
"learning_rate": 4.1549246160349445e-06,
|
|
"loss": 1.2725,
|
|
"mean_token_accuracy": 0.6964371144771576,
|
|
"num_tokens": 262062947.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 1.275,
|
|
"epoch": 0.3895038950389504,
|
|
"grad_norm": 0.1109868148826064,
|
|
"learning_rate": 4.151402000845428e-06,
|
|
"loss": 1.2811,
|
|
"mean_token_accuracy": 0.6948269367218017,
|
|
"num_tokens": 263008190.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 0.39087057537242037,
|
|
"grad_norm": 0.10930896996531131,
|
|
"learning_rate": 4.147879385655912e-06,
|
|
"loss": 1.2266,
|
|
"mean_token_accuracy": 0.7060554087162018,
|
|
"num_tokens": 263918362.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.3922372557058904,
|
|
"grad_norm": 0.14735282971681,
|
|
"learning_rate": 4.144356770466394e-06,
|
|
"loss": 1.2377,
|
|
"mean_token_accuracy": 0.6997297942638397,
|
|
"num_tokens": 264804628.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 1.32578125,
|
|
"epoch": 0.3936039360393604,
|
|
"grad_norm": 0.12547858997241734,
|
|
"learning_rate": 4.140834155276878e-06,
|
|
"loss": 1.3233,
|
|
"mean_token_accuracy": 0.688546109199524,
|
|
"num_tokens": 265741733.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 1.28125,
|
|
"epoch": 0.3949706163728304,
|
|
"grad_norm": 0.13798791356386056,
|
|
"learning_rate": 4.137311540087361e-06,
|
|
"loss": 1.29,
|
|
"mean_token_accuracy": 0.696085637807846,
|
|
"num_tokens": 266715918.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 1.30546875,
|
|
"epoch": 0.3963372967063004,
|
|
"grad_norm": 0.12580697741053476,
|
|
"learning_rate": 4.133788924897845e-06,
|
|
"loss": 1.2955,
|
|
"mean_token_accuracy": 0.693737131357193,
|
|
"num_tokens": 267644209.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 0.3977039770397704,
|
|
"grad_norm": 0.11861293058590129,
|
|
"learning_rate": 4.130266309708328e-06,
|
|
"loss": 1.2192,
|
|
"mean_token_accuracy": 0.7094938576221466,
|
|
"num_tokens": 268556439.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 1.30234375,
|
|
"epoch": 0.3990706573732404,
|
|
"grad_norm": 0.12252509092990736,
|
|
"learning_rate": 4.126743694518811e-06,
|
|
"loss": 1.3019,
|
|
"mean_token_accuracy": 0.6912787973880767,
|
|
"num_tokens": 269504566.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 1.2421875,
|
|
"epoch": 0.4004373377067104,
|
|
"grad_norm": 0.11438029024669193,
|
|
"learning_rate": 4.123221079329294e-06,
|
|
"loss": 1.2444,
|
|
"mean_token_accuracy": 0.7009967744350434,
|
|
"num_tokens": 270367741.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 1.28359375,
|
|
"epoch": 0.4018040180401804,
|
|
"grad_norm": 0.1529209322762558,
|
|
"learning_rate": 4.1196984641397774e-06,
|
|
"loss": 1.2856,
|
|
"mean_token_accuracy": 0.6936851799488067,
|
|
"num_tokens": 271271744.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 0.4031706983736504,
|
|
"grad_norm": 0.12087743118783123,
|
|
"learning_rate": 4.116175848950261e-06,
|
|
"loss": 1.225,
|
|
"mean_token_accuracy": 0.7090462625026703,
|
|
"num_tokens": 272171745.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.4045373787071204,
|
|
"grad_norm": 0.1521052865335584,
|
|
"learning_rate": 4.1126532337607445e-06,
|
|
"loss": 1.2268,
|
|
"mean_token_accuracy": 0.7047896325588227,
|
|
"num_tokens": 273016295.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 1.278125,
|
|
"epoch": 0.4059040590405904,
|
|
"grad_norm": 0.15406087897171142,
|
|
"learning_rate": 4.109130618571227e-06,
|
|
"loss": 1.2817,
|
|
"mean_token_accuracy": 0.6963451564311981,
|
|
"num_tokens": 273956028.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 1.26640625,
|
|
"epoch": 0.4072707393740604,
|
|
"grad_norm": 0.18127663089398183,
|
|
"learning_rate": 4.105608003381711e-06,
|
|
"loss": 1.271,
|
|
"mean_token_accuracy": 0.6924862325191498,
|
|
"num_tokens": 274864033.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.4086374197075304,
|
|
"grad_norm": 0.21382664920677885,
|
|
"learning_rate": 4.102085388192194e-06,
|
|
"loss": 1.2257,
|
|
"mean_token_accuracy": 0.7071579754352569,
|
|
"num_tokens": 275771714.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.4100041000410004,
|
|
"grad_norm": 0.12203740994950994,
|
|
"learning_rate": 4.098562773002678e-06,
|
|
"loss": 1.268,
|
|
"mean_token_accuracy": 0.6932590067386627,
|
|
"num_tokens": 276671468.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.4113707803744704,
|
|
"grad_norm": 0.11088384994869899,
|
|
"learning_rate": 4.095040157813161e-06,
|
|
"loss": 1.2466,
|
|
"mean_token_accuracy": 0.7026635229587554,
|
|
"num_tokens": 277629191.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 1.28359375,
|
|
"epoch": 0.41273746070794043,
|
|
"grad_norm": 0.1194122100165308,
|
|
"learning_rate": 4.091517542623644e-06,
|
|
"loss": 1.2986,
|
|
"mean_token_accuracy": 0.6923550128936767,
|
|
"num_tokens": 278530380.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 1.303125,
|
|
"epoch": 0.4141041410414104,
|
|
"grad_norm": 0.12290406340074199,
|
|
"learning_rate": 4.087994927434128e-06,
|
|
"loss": 1.3203,
|
|
"mean_token_accuracy": 0.6876026511192321,
|
|
"num_tokens": 279444562.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 1.2671875,
|
|
"epoch": 0.4154708213748804,
|
|
"grad_norm": 0.12278463491335583,
|
|
"learning_rate": 4.08447231224461e-06,
|
|
"loss": 1.2773,
|
|
"mean_token_accuracy": 0.6950074672698975,
|
|
"num_tokens": 280334745.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 0.41683750170835043,
|
|
"grad_norm": 0.11578972146361072,
|
|
"learning_rate": 4.080949697055094e-06,
|
|
"loss": 1.2152,
|
|
"mean_token_accuracy": 0.7067115724086761,
|
|
"num_tokens": 281213602.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.4182041820418204,
|
|
"grad_norm": 0.11482196221299577,
|
|
"learning_rate": 4.0774270818655775e-06,
|
|
"loss": 1.2461,
|
|
"mean_token_accuracy": 0.7034240663051605,
|
|
"num_tokens": 282155704.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 1.2875,
|
|
"epoch": 0.4195708623752904,
|
|
"grad_norm": 0.11676758584596839,
|
|
"learning_rate": 4.073904466676061e-06,
|
|
"loss": 1.2933,
|
|
"mean_token_accuracy": 0.69227836728096,
|
|
"num_tokens": 283060115.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.42093754270876044,
|
|
"grad_norm": 0.13889990025112586,
|
|
"learning_rate": 4.070381851486544e-06,
|
|
"loss": 1.2748,
|
|
"mean_token_accuracy": 0.7000295758247376,
|
|
"num_tokens": 284015383.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.4223042230422304,
|
|
"grad_norm": 0.11349205165760369,
|
|
"learning_rate": 4.066859236297027e-06,
|
|
"loss": 1.263,
|
|
"mean_token_accuracy": 0.7005176544189453,
|
|
"num_tokens": 284957286.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 1.26640625,
|
|
"epoch": 0.4236709033757004,
|
|
"grad_norm": 0.11889528554779359,
|
|
"learning_rate": 4.06333662110751e-06,
|
|
"loss": 1.2668,
|
|
"mean_token_accuracy": 0.7004799962043762,
|
|
"num_tokens": 285849739.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 1.3203125,
|
|
"epoch": 0.42503758370917044,
|
|
"grad_norm": 0.13055415877214335,
|
|
"learning_rate": 4.059814005917994e-06,
|
|
"loss": 1.3351,
|
|
"mean_token_accuracy": 0.6881525635719299,
|
|
"num_tokens": 286816194.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 1.26171875,
|
|
"epoch": 0.4264042640426404,
|
|
"grad_norm": 0.13089566008878434,
|
|
"learning_rate": 4.056291390728477e-06,
|
|
"loss": 1.2618,
|
|
"mean_token_accuracy": 0.7006569504737854,
|
|
"num_tokens": 287733305.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 1.2984375,
|
|
"epoch": 0.4277709443761104,
|
|
"grad_norm": 0.1363352718814213,
|
|
"learning_rate": 4.052768775538961e-06,
|
|
"loss": 1.3118,
|
|
"mean_token_accuracy": 0.6927517831325531,
|
|
"num_tokens": 288676265.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 1.26171875,
|
|
"epoch": 0.42913762470958045,
|
|
"grad_norm": 0.1195214896900436,
|
|
"learning_rate": 4.049246160349444e-06,
|
|
"loss": 1.2662,
|
|
"mean_token_accuracy": 0.6999547719955445,
|
|
"num_tokens": 289628921.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.43050430504305043,
|
|
"grad_norm": 0.12241272213645364,
|
|
"learning_rate": 4.045723545159927e-06,
|
|
"loss": 1.2314,
|
|
"mean_token_accuracy": 0.7052522838115692,
|
|
"num_tokens": 290508054.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 1.21953125,
|
|
"epoch": 0.4318709853765204,
|
|
"grad_norm": 0.10422714858237556,
|
|
"learning_rate": 4.04220092997041e-06,
|
|
"loss": 1.2226,
|
|
"mean_token_accuracy": 0.7081830561161041,
|
|
"num_tokens": 291445061.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 1.253125,
|
|
"epoch": 0.43323766570999045,
|
|
"grad_norm": 0.1838700636877357,
|
|
"learning_rate": 4.038678314780894e-06,
|
|
"loss": 1.2426,
|
|
"mean_token_accuracy": 0.7024298250675202,
|
|
"num_tokens": 292415043.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.43460434604346043,
|
|
"grad_norm": 0.12351144694193589,
|
|
"learning_rate": 4.0351556995913775e-06,
|
|
"loss": 1.259,
|
|
"mean_token_accuracy": 0.6991285681724548,
|
|
"num_tokens": 293378677.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 0.4359710263769304,
|
|
"grad_norm": 0.12348669913545798,
|
|
"learning_rate": 4.03163308440186e-06,
|
|
"loss": 1.2611,
|
|
"mean_token_accuracy": 0.6992734491825103,
|
|
"num_tokens": 294289221.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 0.43733770671040045,
|
|
"grad_norm": 0.16485047211102669,
|
|
"learning_rate": 4.028110469212344e-06,
|
|
"loss": 1.2619,
|
|
"mean_token_accuracy": 0.7017032146453858,
|
|
"num_tokens": 295166106.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.43870438704387044,
|
|
"grad_norm": 0.12260547927324919,
|
|
"learning_rate": 4.0245878540228264e-06,
|
|
"loss": 1.2436,
|
|
"mean_token_accuracy": 0.7023661017417908,
|
|
"num_tokens": 296076562.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 1.26796875,
|
|
"epoch": 0.4400710673773404,
|
|
"grad_norm": 0.1827179859892404,
|
|
"learning_rate": 4.02106523883331e-06,
|
|
"loss": 1.2723,
|
|
"mean_token_accuracy": 0.6980110347270966,
|
|
"num_tokens": 296967830.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.44143774771081046,
|
|
"grad_norm": 0.18772536410217897,
|
|
"learning_rate": 4.0175426236437935e-06,
|
|
"loss": 1.2072,
|
|
"mean_token_accuracy": 0.7110700249671936,
|
|
"num_tokens": 297854033.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.44280442804428044,
|
|
"grad_norm": 0.1763982020377319,
|
|
"learning_rate": 4.014020008454277e-06,
|
|
"loss": 1.2313,
|
|
"mean_token_accuracy": 0.7048666715621948,
|
|
"num_tokens": 298775899.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 0.4441711083777504,
|
|
"grad_norm": 0.11189720575641501,
|
|
"learning_rate": 4.01049739326476e-06,
|
|
"loss": 1.211,
|
|
"mean_token_accuracy": 0.7061209440231323,
|
|
"num_tokens": 299644998.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 1.259375,
|
|
"epoch": 0.44553778871122046,
|
|
"grad_norm": 0.12778390359698563,
|
|
"learning_rate": 4.006974778075243e-06,
|
|
"loss": 1.2655,
|
|
"mean_token_accuracy": 0.6993802547454834,
|
|
"num_tokens": 300579588.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 1.2734375,
|
|
"epoch": 0.44690446904469044,
|
|
"grad_norm": 0.1176691294050488,
|
|
"learning_rate": 4.003452162885727e-06,
|
|
"loss": 1.2784,
|
|
"mean_token_accuracy": 0.6925972759723663,
|
|
"num_tokens": 301539377.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.4482711493781604,
|
|
"grad_norm": 0.16950149066222447,
|
|
"learning_rate": 3.99992954769621e-06,
|
|
"loss": 1.2215,
|
|
"mean_token_accuracy": 0.7061489284038543,
|
|
"num_tokens": 302459676.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 0.44963782971163047,
|
|
"grad_norm": 0.12953329412643733,
|
|
"learning_rate": 3.996406932506693e-06,
|
|
"loss": 1.2366,
|
|
"mean_token_accuracy": 0.7036916553974152,
|
|
"num_tokens": 303424014.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.45100451004510045,
|
|
"grad_norm": 0.13171471359865944,
|
|
"learning_rate": 3.992884317317177e-06,
|
|
"loss": 1.2389,
|
|
"mean_token_accuracy": 0.7038282930850983,
|
|
"num_tokens": 304396334.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 1.2765625,
|
|
"epoch": 0.45237119037857043,
|
|
"grad_norm": 0.11901205680387979,
|
|
"learning_rate": 3.98936170212766e-06,
|
|
"loss": 1.2891,
|
|
"mean_token_accuracy": 0.6949994564056396,
|
|
"num_tokens": 305340507.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 0.45373787071204047,
|
|
"grad_norm": 0.12282075985984615,
|
|
"learning_rate": 3.985839086938143e-06,
|
|
"loss": 1.219,
|
|
"mean_token_accuracy": 0.7053154587745667,
|
|
"num_tokens": 306253804.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.45510455104551045,
|
|
"grad_norm": 0.12960138525498194,
|
|
"learning_rate": 3.9823164717486265e-06,
|
|
"loss": 1.2717,
|
|
"mean_token_accuracy": 0.6964719116687774,
|
|
"num_tokens": 307219655.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 0.45647123137898044,
|
|
"grad_norm": 0.11402304325346672,
|
|
"learning_rate": 3.97879385655911e-06,
|
|
"loss": 1.2613,
|
|
"mean_token_accuracy": 0.6993931233882904,
|
|
"num_tokens": 308120248.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 1.2890625,
|
|
"epoch": 0.4578379117124505,
|
|
"grad_norm": 0.11467088664716353,
|
|
"learning_rate": 3.9752712413695936e-06,
|
|
"loss": 1.2975,
|
|
"mean_token_accuracy": 0.6938745737075805,
|
|
"num_tokens": 309033220.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 1.33515625,
|
|
"epoch": 0.45920459204592046,
|
|
"grad_norm": 0.12185973954693556,
|
|
"learning_rate": 3.971748626180076e-06,
|
|
"loss": 1.3439,
|
|
"mean_token_accuracy": 0.6823151230812072,
|
|
"num_tokens": 309966122.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 1.25625,
|
|
"epoch": 0.46057127237939044,
|
|
"grad_norm": 0.12103033835876384,
|
|
"learning_rate": 3.96822601099056e-06,
|
|
"loss": 1.2572,
|
|
"mean_token_accuracy": 0.6986809134483337,
|
|
"num_tokens": 310927570.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 1.25078125,
|
|
"epoch": 0.4619379527128605,
|
|
"grad_norm": 0.18343949136830298,
|
|
"learning_rate": 3.9647033958010425e-06,
|
|
"loss": 1.2669,
|
|
"mean_token_accuracy": 0.6974805653095245,
|
|
"num_tokens": 311832124.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 1.25546875,
|
|
"epoch": 0.46330463304633046,
|
|
"grad_norm": 0.12564993605813216,
|
|
"learning_rate": 3.961180780611527e-06,
|
|
"loss": 1.2633,
|
|
"mean_token_accuracy": 0.69639453291893,
|
|
"num_tokens": 312752210.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 1.2796875,
|
|
"epoch": 0.46467131337980044,
|
|
"grad_norm": 0.14357900478041893,
|
|
"learning_rate": 3.95765816542201e-06,
|
|
"loss": 1.2817,
|
|
"mean_token_accuracy": 0.6976219177246094,
|
|
"num_tokens": 313721040.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 1.23359375,
|
|
"epoch": 0.4660379937132705,
|
|
"grad_norm": 0.12862902337650894,
|
|
"learning_rate": 3.954135550232493e-06,
|
|
"loss": 1.2281,
|
|
"mean_token_accuracy": 0.7039004743099213,
|
|
"num_tokens": 314623635.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.46740467404674046,
|
|
"grad_norm": 0.12402662061841266,
|
|
"learning_rate": 3.950612935042976e-06,
|
|
"loss": 1.2541,
|
|
"mean_token_accuracy": 0.7014503538608551,
|
|
"num_tokens": 315549717.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.46877135438021045,
|
|
"grad_norm": 0.11272762204308995,
|
|
"learning_rate": 3.947090319853459e-06,
|
|
"loss": 1.227,
|
|
"mean_token_accuracy": 0.7058685302734375,
|
|
"num_tokens": 316459223.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 1.29453125,
|
|
"epoch": 0.4701380347136805,
|
|
"grad_norm": 0.14327405018351555,
|
|
"learning_rate": 3.943567704663943e-06,
|
|
"loss": 1.2926,
|
|
"mean_token_accuracy": 0.6944358944892883,
|
|
"num_tokens": 317359360.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 0.47150471504715047,
|
|
"grad_norm": 0.12473573160684626,
|
|
"learning_rate": 3.9400450894744265e-06,
|
|
"loss": 1.2409,
|
|
"mean_token_accuracy": 0.701851361989975,
|
|
"num_tokens": 318239868.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.47287139538062045,
|
|
"grad_norm": 0.15723517302660947,
|
|
"learning_rate": 3.936522474284909e-06,
|
|
"loss": 1.2915,
|
|
"mean_token_accuracy": 0.694439709186554,
|
|
"num_tokens": 319176882.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 1.240625,
|
|
"epoch": 0.4742380757140905,
|
|
"grad_norm": 0.17661832354712106,
|
|
"learning_rate": 3.932999859095393e-06,
|
|
"loss": 1.2574,
|
|
"mean_token_accuracy": 0.7002659499645233,
|
|
"num_tokens": 320165510.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.4756047560475605,
|
|
"grad_norm": 0.13821431853823743,
|
|
"learning_rate": 3.929477243905876e-06,
|
|
"loss": 1.2238,
|
|
"mean_token_accuracy": 0.7081616759300232,
|
|
"num_tokens": 321088771.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 1.26484375,
|
|
"epoch": 0.47697143638103046,
|
|
"grad_norm": 0.1332489148932578,
|
|
"learning_rate": 3.925954628716359e-06,
|
|
"loss": 1.261,
|
|
"mean_token_accuracy": 0.7019150912761688,
|
|
"num_tokens": 322002114.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 1.25703125,
|
|
"epoch": 0.4783381167145005,
|
|
"grad_norm": 0.14209226578187423,
|
|
"learning_rate": 3.9224320135268425e-06,
|
|
"loss": 1.2528,
|
|
"mean_token_accuracy": 0.7005915403366089,
|
|
"num_tokens": 322898306.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 1.26796875,
|
|
"epoch": 0.4797047970479705,
|
|
"grad_norm": 0.13497947020386888,
|
|
"learning_rate": 3.918909398337326e-06,
|
|
"loss": 1.2726,
|
|
"mean_token_accuracy": 0.6967598378658295,
|
|
"num_tokens": 323816623.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.48107147738144046,
|
|
"grad_norm": 0.13532351838577164,
|
|
"learning_rate": 3.91538678314781e-06,
|
|
"loss": 1.2387,
|
|
"mean_token_accuracy": 0.7050524771213531,
|
|
"num_tokens": 324757771.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 1.2765625,
|
|
"epoch": 0.4824381577149105,
|
|
"grad_norm": 0.11228715786364492,
|
|
"learning_rate": 3.911864167958292e-06,
|
|
"loss": 1.2694,
|
|
"mean_token_accuracy": 0.6995607554912567,
|
|
"num_tokens": 325695539.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 0.4838048380483805,
|
|
"grad_norm": 0.2047813524246208,
|
|
"learning_rate": 3.908341552768776e-06,
|
|
"loss": 1.2299,
|
|
"mean_token_accuracy": 0.7051625609397888,
|
|
"num_tokens": 326619627.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 0.48517151838185046,
|
|
"grad_norm": 0.13057740714151608,
|
|
"learning_rate": 3.9048189375792586e-06,
|
|
"loss": 1.2208,
|
|
"mean_token_accuracy": 0.7032055616378784,
|
|
"num_tokens": 327522409.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.4865381987153205,
|
|
"grad_norm": 0.11237434110920828,
|
|
"learning_rate": 3.901296322389743e-06,
|
|
"loss": 1.2744,
|
|
"mean_token_accuracy": 0.6952591955661773,
|
|
"num_tokens": 328418711.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.4879048790487905,
|
|
"grad_norm": 0.12032901951240739,
|
|
"learning_rate": 3.897773707200226e-06,
|
|
"loss": 1.2607,
|
|
"mean_token_accuracy": 0.6982460200786591,
|
|
"num_tokens": 329322871.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 1.2546875,
|
|
"epoch": 0.48927155938226047,
|
|
"grad_norm": 0.11871037905476327,
|
|
"learning_rate": 3.894251092010709e-06,
|
|
"loss": 1.2451,
|
|
"mean_token_accuracy": 0.7017015755176544,
|
|
"num_tokens": 330272623.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.4906382397157305,
|
|
"grad_norm": 0.1233019828844048,
|
|
"learning_rate": 3.890728476821192e-06,
|
|
"loss": 1.2031,
|
|
"mean_token_accuracy": 0.7104011476039886,
|
|
"num_tokens": 331160163.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.4920049200492005,
|
|
"grad_norm": 0.1320481062996068,
|
|
"learning_rate": 3.8872058616316755e-06,
|
|
"loss": 1.1701,
|
|
"mean_token_accuracy": 0.7177197217941285,
|
|
"num_tokens": 332041350.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 0.49337160038267047,
|
|
"grad_norm": 0.11566680445551253,
|
|
"learning_rate": 3.883683246442159e-06,
|
|
"loss": 1.2399,
|
|
"mean_token_accuracy": 0.7029148638248444,
|
|
"num_tokens": 332958062.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.4947382807161405,
|
|
"grad_norm": 0.10725173420369238,
|
|
"learning_rate": 3.8801606312526426e-06,
|
|
"loss": 1.2526,
|
|
"mean_token_accuracy": 0.7017835378646851,
|
|
"num_tokens": 333926373.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 0.4961049610496105,
|
|
"grad_norm": 0.11967044384869208,
|
|
"learning_rate": 3.876638016063125e-06,
|
|
"loss": 1.1793,
|
|
"mean_token_accuracy": 0.7160211622714996,
|
|
"num_tokens": 334867149.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 0.4974716413830805,
|
|
"grad_norm": 0.12485125693424547,
|
|
"learning_rate": 3.873115400873609e-06,
|
|
"loss": 1.2127,
|
|
"mean_token_accuracy": 0.7084379136562348,
|
|
"num_tokens": 335748006.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 1.275,
|
|
"epoch": 0.4988383217165505,
|
|
"grad_norm": 0.11240088665099021,
|
|
"learning_rate": 3.869592785684092e-06,
|
|
"loss": 1.2781,
|
|
"mean_token_accuracy": 0.6980305790901185,
|
|
"num_tokens": 336676648.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.5002050020500205,
|
|
"grad_norm": 0.10835299983925814,
|
|
"learning_rate": 3.866070170494575e-06,
|
|
"loss": 1.2343,
|
|
"mean_token_accuracy": 0.7063319981098175,
|
|
"num_tokens": 337574385.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 1.240625,
|
|
"epoch": 0.5015716823834905,
|
|
"grad_norm": 0.23624800659169648,
|
|
"learning_rate": 3.862547555305059e-06,
|
|
"loss": 1.2588,
|
|
"mean_token_accuracy": 0.700107729434967,
|
|
"num_tokens": 338532100.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.5029383627169605,
|
|
"grad_norm": 0.1501346723674562,
|
|
"learning_rate": 3.859024940115542e-06,
|
|
"loss": 1.2658,
|
|
"mean_token_accuracy": 0.6982304275035858,
|
|
"num_tokens": 339464473.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.5043050430504306,
|
|
"grad_norm": 0.11401744439373263,
|
|
"learning_rate": 3.855502324926026e-06,
|
|
"loss": 1.2048,
|
|
"mean_token_accuracy": 0.7127435684204102,
|
|
"num_tokens": 340432340.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.5056717233839005,
|
|
"grad_norm": 0.11249089974750306,
|
|
"learning_rate": 3.851979709736508e-06,
|
|
"loss": 1.2515,
|
|
"mean_token_accuracy": 0.7041179597377777,
|
|
"num_tokens": 341335395.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 0.5070384037173705,
|
|
"grad_norm": 0.2402200575443088,
|
|
"learning_rate": 3.848457094546992e-06,
|
|
"loss": 1.2437,
|
|
"mean_token_accuracy": 0.7033019483089447,
|
|
"num_tokens": 342247450.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 0.5084050840508405,
|
|
"grad_norm": 0.11178998136718586,
|
|
"learning_rate": 3.8449344793574755e-06,
|
|
"loss": 1.2217,
|
|
"mean_token_accuracy": 0.7073119938373565,
|
|
"num_tokens": 343198674.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 1.24765625,
|
|
"epoch": 0.5097717643843105,
|
|
"grad_norm": 0.111358280614814,
|
|
"learning_rate": 3.841411864167959e-06,
|
|
"loss": 1.2504,
|
|
"mean_token_accuracy": 0.7009368121623993,
|
|
"num_tokens": 344106894.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.5111384447177805,
|
|
"grad_norm": 0.1526106093379976,
|
|
"learning_rate": 3.837889248978442e-06,
|
|
"loss": 1.2591,
|
|
"mean_token_accuracy": 0.6991377711296082,
|
|
"num_tokens": 345010494.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 0.5125051250512506,
|
|
"grad_norm": 0.13693699675400622,
|
|
"learning_rate": 3.834366633788925e-06,
|
|
"loss": 1.2401,
|
|
"mean_token_accuracy": 0.7043694138526917,
|
|
"num_tokens": 345934020.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 0.5138718053847205,
|
|
"grad_norm": 0.1472881009324309,
|
|
"learning_rate": 3.830844018599408e-06,
|
|
"loss": 1.2563,
|
|
"mean_token_accuracy": 0.7000455021858215,
|
|
"num_tokens": 346799355.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 1.26015625,
|
|
"epoch": 0.5152384857181905,
|
|
"grad_norm": 0.1148725386884202,
|
|
"learning_rate": 3.8273214034098915e-06,
|
|
"loss": 1.2593,
|
|
"mean_token_accuracy": 0.7013286054134369,
|
|
"num_tokens": 347721206.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 1.27265625,
|
|
"epoch": 0.5166051660516605,
|
|
"grad_norm": 0.13892061352297083,
|
|
"learning_rate": 3.823798788220375e-06,
|
|
"loss": 1.2752,
|
|
"mean_token_accuracy": 0.6952713668346405,
|
|
"num_tokens": 348616795.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 0.5179718463851305,
|
|
"grad_norm": 0.1275087195696206,
|
|
"learning_rate": 3.820276173030859e-06,
|
|
"loss": 1.2218,
|
|
"mean_token_accuracy": 0.7078777134418488,
|
|
"num_tokens": 349508928.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.5193385267186005,
|
|
"grad_norm": 0.140190863416897,
|
|
"learning_rate": 3.816753557841341e-06,
|
|
"loss": 1.2467,
|
|
"mean_token_accuracy": 0.7020524680614472,
|
|
"num_tokens": 350404276.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.5207052070520706,
|
|
"grad_norm": 0.1146849397155704,
|
|
"learning_rate": 3.813230942651825e-06,
|
|
"loss": 1.2261,
|
|
"mean_token_accuracy": 0.7062445700168609,
|
|
"num_tokens": 351273919.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 1.2453125,
|
|
"epoch": 0.5220718873855406,
|
|
"grad_norm": 0.12293278074844904,
|
|
"learning_rate": 3.809708327462308e-06,
|
|
"loss": 1.248,
|
|
"mean_token_accuracy": 0.7010807693004608,
|
|
"num_tokens": 352170178.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 1.2765625,
|
|
"epoch": 0.5234385677190105,
|
|
"grad_norm": 0.12249648123381536,
|
|
"learning_rate": 3.806185712272792e-06,
|
|
"loss": 1.2849,
|
|
"mean_token_accuracy": 0.6972138941287994,
|
|
"num_tokens": 353108363.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 1.29921875,
|
|
"epoch": 0.5248052480524805,
|
|
"grad_norm": 0.11358921007705441,
|
|
"learning_rate": 3.802663097083275e-06,
|
|
"loss": 1.3163,
|
|
"mean_token_accuracy": 0.6894097208976746,
|
|
"num_tokens": 353972875.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 1.25859375,
|
|
"epoch": 0.5261719283859505,
|
|
"grad_norm": 0.13347390573555323,
|
|
"learning_rate": 3.799140481893758e-06,
|
|
"loss": 1.2544,
|
|
"mean_token_accuracy": 0.7015778541564941,
|
|
"num_tokens": 354864325.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.5275386087194205,
|
|
"grad_norm": 0.13997432869292936,
|
|
"learning_rate": 3.7956178667042413e-06,
|
|
"loss": 1.2279,
|
|
"mean_token_accuracy": 0.7086558878421784,
|
|
"num_tokens": 355743210.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.5289052890528906,
|
|
"grad_norm": 0.11250181995575796,
|
|
"learning_rate": 3.792095251514725e-06,
|
|
"loss": 1.1955,
|
|
"mean_token_accuracy": 0.7097419917583465,
|
|
"num_tokens": 356680442.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 0.5302719693863606,
|
|
"grad_norm": 0.11384821024074558,
|
|
"learning_rate": 3.788572636325208e-06,
|
|
"loss": 1.2375,
|
|
"mean_token_accuracy": 0.7062253832817078,
|
|
"num_tokens": 357633513.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 0.5316386497198305,
|
|
"grad_norm": 0.11108786426857967,
|
|
"learning_rate": 3.7850500211356916e-06,
|
|
"loss": 1.1947,
|
|
"mean_token_accuracy": 0.7110786616802216,
|
|
"num_tokens": 358560575.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 1.25625,
|
|
"epoch": 0.5330053300533005,
|
|
"grad_norm": 0.11609254070265762,
|
|
"learning_rate": 3.7815274059461747e-06,
|
|
"loss": 1.2659,
|
|
"mean_token_accuracy": 0.6988455653190613,
|
|
"num_tokens": 359474308.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 1.23671875,
|
|
"epoch": 0.5343720103867705,
|
|
"grad_norm": 0.10888454406896787,
|
|
"learning_rate": 3.7780047907566582e-06,
|
|
"loss": 1.2518,
|
|
"mean_token_accuracy": 0.7004968166351319,
|
|
"num_tokens": 360411784.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 1.23671875,
|
|
"epoch": 0.5357386907202405,
|
|
"grad_norm": 0.11862026989563704,
|
|
"learning_rate": 3.7744821755671413e-06,
|
|
"loss": 1.2414,
|
|
"mean_token_accuracy": 0.7063008785247803,
|
|
"num_tokens": 361350524.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.5371053710537106,
|
|
"grad_norm": 0.11164846782683206,
|
|
"learning_rate": 3.7709595603776245e-06,
|
|
"loss": 1.2924,
|
|
"mean_token_accuracy": 0.6948120534420014,
|
|
"num_tokens": 362283058.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.5384720513871806,
|
|
"grad_norm": 0.12887607582460717,
|
|
"learning_rate": 3.767436945188108e-06,
|
|
"loss": 1.2613,
|
|
"mean_token_accuracy": 0.698154616355896,
|
|
"num_tokens": 363194527.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.5398387317206506,
|
|
"grad_norm": 0.12934509811616365,
|
|
"learning_rate": 3.7639143299985916e-06,
|
|
"loss": 1.2196,
|
|
"mean_token_accuracy": 0.7077894926071167,
|
|
"num_tokens": 364108504.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 1.2578125,
|
|
"epoch": 0.5412054120541205,
|
|
"grad_norm": 0.17112673595378783,
|
|
"learning_rate": 3.7603917148090747e-06,
|
|
"loss": 1.2706,
|
|
"mean_token_accuracy": 0.6989914298057556,
|
|
"num_tokens": 365029878.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.5425720923875905,
|
|
"grad_norm": 0.11704754926571964,
|
|
"learning_rate": 3.756869099619558e-06,
|
|
"loss": 1.2145,
|
|
"mean_token_accuracy": 0.7098171234130859,
|
|
"num_tokens": 365964480.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 0.5439387727210605,
|
|
"grad_norm": 0.12009077908663605,
|
|
"learning_rate": 3.753346484430041e-06,
|
|
"loss": 1.2174,
|
|
"mean_token_accuracy": 0.7075273215770721,
|
|
"num_tokens": 366872780.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.5453054530545306,
|
|
"grad_norm": 0.12385766066838488,
|
|
"learning_rate": 3.749823869240524e-06,
|
|
"loss": 1.2027,
|
|
"mean_token_accuracy": 0.7113142311573029,
|
|
"num_tokens": 367776522.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 1.195703125,
|
|
"epoch": 0.5466721333880006,
|
|
"grad_norm": 0.11683789233424546,
|
|
"learning_rate": 3.746301254051008e-06,
|
|
"loss": 1.1987,
|
|
"mean_token_accuracy": 0.7109182476997375,
|
|
"num_tokens": 368687371.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 0.5480388137214706,
|
|
"grad_norm": 0.1140731419354509,
|
|
"learning_rate": 3.742778638861491e-06,
|
|
"loss": 1.1953,
|
|
"mean_token_accuracy": 0.7108234703540802,
|
|
"num_tokens": 369596596.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 1.240625,
|
|
"epoch": 0.5494054940549405,
|
|
"grad_norm": 0.11458256438144522,
|
|
"learning_rate": 3.7392560236719743e-06,
|
|
"loss": 1.2421,
|
|
"mean_token_accuracy": 0.7022044777870178,
|
|
"num_tokens": 370474928.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.5507721743884105,
|
|
"grad_norm": 0.12276729784913244,
|
|
"learning_rate": 3.7357334084824574e-06,
|
|
"loss": 1.1876,
|
|
"mean_token_accuracy": 0.7145244717597962,
|
|
"num_tokens": 371384674.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.5521388547218805,
|
|
"grad_norm": 0.13917538823924414,
|
|
"learning_rate": 3.732210793292941e-06,
|
|
"loss": 1.1979,
|
|
"mean_token_accuracy": 0.7105246603488922,
|
|
"num_tokens": 372309541.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 1.275,
|
|
"epoch": 0.5535055350553506,
|
|
"grad_norm": 0.12056504225645434,
|
|
"learning_rate": 3.7286881781034245e-06,
|
|
"loss": 1.2801,
|
|
"mean_token_accuracy": 0.6950753152370452,
|
|
"num_tokens": 373247679.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.5548722153888206,
|
|
"grad_norm": 0.1544883378344909,
|
|
"learning_rate": 3.7251655629139076e-06,
|
|
"loss": 1.2348,
|
|
"mean_token_accuracy": 0.705150443315506,
|
|
"num_tokens": 374199142.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 1.2453125,
|
|
"epoch": 0.5562388957222906,
|
|
"grad_norm": 0.11631559182711568,
|
|
"learning_rate": 3.7216429477243907e-06,
|
|
"loss": 1.2406,
|
|
"mean_token_accuracy": 0.7014714360237122,
|
|
"num_tokens": 375123594.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.5576055760557606,
|
|
"grad_norm": 0.1387535941719255,
|
|
"learning_rate": 3.7181203325348743e-06,
|
|
"loss": 1.2335,
|
|
"mean_token_accuracy": 0.7051595449447632,
|
|
"num_tokens": 376085352.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 1.28671875,
|
|
"epoch": 0.5589722563892305,
|
|
"grad_norm": 0.21485299284753123,
|
|
"learning_rate": 3.7145977173453574e-06,
|
|
"loss": 1.2853,
|
|
"mean_token_accuracy": 0.6939905941486358,
|
|
"num_tokens": 376981305.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 0.5603389367227005,
|
|
"grad_norm": 0.12744857357616912,
|
|
"learning_rate": 3.7110751021558405e-06,
|
|
"loss": 1.2159,
|
|
"mean_token_accuracy": 0.706365704536438,
|
|
"num_tokens": 377879583.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.5617056170561706,
|
|
"grad_norm": 0.11622385064142135,
|
|
"learning_rate": 3.7075524869663245e-06,
|
|
"loss": 1.2199,
|
|
"mean_token_accuracy": 0.7055269181728363,
|
|
"num_tokens": 378832645.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.5630722973896406,
|
|
"grad_norm": 0.12202388185882163,
|
|
"learning_rate": 3.7040298717768076e-06,
|
|
"loss": 1.2177,
|
|
"mean_token_accuracy": 0.7082817852497101,
|
|
"num_tokens": 379779952.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 1.2578125,
|
|
"epoch": 0.5644389777231106,
|
|
"grad_norm": 0.12864990402944457,
|
|
"learning_rate": 3.7005072565872908e-06,
|
|
"loss": 1.2507,
|
|
"mean_token_accuracy": 0.700800359249115,
|
|
"num_tokens": 380737380.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.5658056580565806,
|
|
"grad_norm": 0.12592051179365316,
|
|
"learning_rate": 3.696984641397774e-06,
|
|
"loss": 1.2412,
|
|
"mean_token_accuracy": 0.7029470920562744,
|
|
"num_tokens": 381656151.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 0.5671723383900505,
|
|
"grad_norm": 0.28775242742818613,
|
|
"learning_rate": 3.693462026208257e-06,
|
|
"loss": 1.1948,
|
|
"mean_token_accuracy": 0.7117050290107727,
|
|
"num_tokens": 382571650.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.5685390187235205,
|
|
"grad_norm": 0.1107019156249638,
|
|
"learning_rate": 3.689939411018741e-06,
|
|
"loss": 1.2599,
|
|
"mean_token_accuracy": 0.7026832282543183,
|
|
"num_tokens": 383530667.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 0.5699056990569906,
|
|
"grad_norm": 0.16649642462489805,
|
|
"learning_rate": 3.686416795829224e-06,
|
|
"loss": 1.1645,
|
|
"mean_token_accuracy": 0.7162833929061889,
|
|
"num_tokens": 384438746.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 1.21953125,
|
|
"epoch": 0.5712723793904606,
|
|
"grad_norm": 0.14718646181027972,
|
|
"learning_rate": 3.6828941806397072e-06,
|
|
"loss": 1.2252,
|
|
"mean_token_accuracy": 0.7068425595760346,
|
|
"num_tokens": 385325961.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 0.5726390597239306,
|
|
"grad_norm": 0.15513614124390127,
|
|
"learning_rate": 3.6793715654501903e-06,
|
|
"loss": 1.227,
|
|
"mean_token_accuracy": 0.7050190985202789,
|
|
"num_tokens": 386264764.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 0.5740057400574006,
|
|
"grad_norm": 0.1296550268410452,
|
|
"learning_rate": 3.6758489502606735e-06,
|
|
"loss": 1.2426,
|
|
"mean_token_accuracy": 0.7043660819530487,
|
|
"num_tokens": 387231562.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.5753724203908706,
|
|
"grad_norm": 0.12550778555012287,
|
|
"learning_rate": 3.672326335071157e-06,
|
|
"loss": 1.228,
|
|
"mean_token_accuracy": 0.705913120508194,
|
|
"num_tokens": 388124752.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.5767391007243405,
|
|
"grad_norm": 0.11275829034487639,
|
|
"learning_rate": 3.6688037198816406e-06,
|
|
"loss": 1.239,
|
|
"mean_token_accuracy": 0.7053440690040589,
|
|
"num_tokens": 389034093.0,
|
|
"step": 4220
|
|
},
{
"entropy": 1.2125,
"epoch": 0.5781057810578106,
"grad_norm": 0.12846094754604664,
"learning_rate": 3.6652811046921237e-06,
"loss": 1.218,
"mean_token_accuracy": 0.7079543530941009,
"num_tokens": 389920605.0,
"step": 4230
},
{
"entropy": 1.19765625,
"epoch": 0.5794724613912806,
"grad_norm": 0.10743317757463652,
"learning_rate": 3.6617584895026072e-06,
"loss": 1.2023,
"mean_token_accuracy": 0.7102604329586029,
"num_tokens": 390802728.0,
"step": 4240
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.5808391417247506,
|
|
"grad_norm": 0.12249471794225775,
|
|
"learning_rate": 3.6582358743130904e-06,
|
|
"loss": 1.2657,
|
|
"mean_token_accuracy": 0.6961081147193908,
|
|
"num_tokens": 391735027.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.5822058220582206,
|
|
"grad_norm": 0.1284434697040564,
|
|
"learning_rate": 3.6547132591235735e-06,
|
|
"loss": 1.2285,
|
|
"mean_token_accuracy": 0.7039616107940674,
|
|
"num_tokens": 392621164.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.5835725023916906,
|
|
"grad_norm": 0.13161285131818717,
|
|
"learning_rate": 3.651190643934057e-06,
|
|
"loss": 1.2375,
|
|
"mean_token_accuracy": 0.7056927680969238,
|
|
"num_tokens": 393474315.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 0.5849391827251605,
|
|
"grad_norm": 0.10485610090061516,
|
|
"learning_rate": 3.6476680287445406e-06,
|
|
"loss": 1.2466,
|
|
"mean_token_accuracy": 0.7039802372455597,
|
|
"num_tokens": 394416453.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.5863058630586306,
|
|
"grad_norm": 0.13401974694362126,
|
|
"learning_rate": 3.6441454135550237e-06,
|
|
"loss": 1.192,
|
|
"mean_token_accuracy": 0.7144893109798431,
|
|
"num_tokens": 395353788.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.5876725433921006,
|
|
"grad_norm": 0.11310119790815837,
|
|
"learning_rate": 3.640622798365507e-06,
|
|
"loss": 1.2383,
|
|
"mean_token_accuracy": 0.7017998814582824,
|
|
"num_tokens": 396277079.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 1.2703125,
|
|
"epoch": 0.5890392237255706,
|
|
"grad_norm": 0.10834471711408972,
|
|
"learning_rate": 3.63710018317599e-06,
|
|
"loss": 1.2667,
|
|
"mean_token_accuracy": 0.6969524681568146,
|
|
"num_tokens": 397204872.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 1.2953125,
|
|
"epoch": 0.5904059040590406,
|
|
"grad_norm": 0.1393845242453168,
|
|
"learning_rate": 3.633577567986473e-06,
|
|
"loss": 1.3077,
|
|
"mean_token_accuracy": 0.6929902076721192,
|
|
"num_tokens": 398106709.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.5917725843925106,
|
|
"grad_norm": 0.1334706804806068,
|
|
"learning_rate": 3.630054952796957e-06,
|
|
"loss": 1.2276,
|
|
"mean_token_accuracy": 0.7076478898525238,
|
|
"num_tokens": 399003959.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 1.271875,
|
|
"epoch": 0.5931392647259806,
|
|
"grad_norm": 0.11383481815764687,
|
|
"learning_rate": 3.62653233760744e-06,
|
|
"loss": 1.2678,
|
|
"mean_token_accuracy": 0.6978739500045776,
|
|
"num_tokens": 399928261.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.5945059450594506,
|
|
"grad_norm": 0.13773545708294493,
|
|
"learning_rate": 3.6230097224179233e-06,
|
|
"loss": 1.2118,
|
|
"mean_token_accuracy": 0.7088652670383453,
|
|
"num_tokens": 400734644.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 0.5958726253929206,
|
|
"grad_norm": 0.12160076117932794,
|
|
"learning_rate": 3.6194871072284064e-06,
|
|
"loss": 1.237,
|
|
"mean_token_accuracy": 0.7054871797561646,
|
|
"num_tokens": 401677248.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.5972393057263906,
|
|
"grad_norm": 0.13660048708814865,
|
|
"learning_rate": 3.61596449203889e-06,
|
|
"loss": 1.2382,
|
|
"mean_token_accuracy": 0.7049317002296448,
|
|
"num_tokens": 402589892.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.5986059860598606,
|
|
"grad_norm": 0.11777961832382056,
|
|
"learning_rate": 3.6124418768493735e-06,
|
|
"loss": 1.1758,
|
|
"mean_token_accuracy": 0.7156641840934753,
|
|
"num_tokens": 403493521.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 1.2640625,
|
|
"epoch": 0.5999726663933306,
|
|
"grad_norm": 0.11372971580548387,
|
|
"learning_rate": 3.6089192616598566e-06,
|
|
"loss": 1.2797,
|
|
"mean_token_accuracy": 0.6975837230682373,
|
|
"num_tokens": 404481917.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 0.6013393467268006,
|
|
"grad_norm": 0.12929972530967948,
|
|
"learning_rate": 3.6053966464703398e-06,
|
|
"loss": 1.2672,
|
|
"mean_token_accuracy": 0.6991229474544525,
|
|
"num_tokens": 405381103.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 1.27109375,
|
|
"epoch": 0.6027060270602707,
|
|
"grad_norm": 0.11386063059167346,
|
|
"learning_rate": 3.6018740312808233e-06,
|
|
"loss": 1.2821,
|
|
"mean_token_accuracy": 0.6944692611694336,
|
|
"num_tokens": 406360551.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.6040727073937406,
|
|
"grad_norm": 0.12472846192193926,
|
|
"learning_rate": 3.5983514160913064e-06,
|
|
"loss": 1.2369,
|
|
"mean_token_accuracy": 0.700896394252777,
|
|
"num_tokens": 407306942.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.6054393877272106,
|
|
"grad_norm": 0.11204937893530698,
|
|
"learning_rate": 3.5948288009017895e-06,
|
|
"loss": 1.2301,
|
|
"mean_token_accuracy": 0.704315322637558,
|
|
"num_tokens": 408238455.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 1.24140625,
|
|
"epoch": 0.6068060680606806,
|
|
"grad_norm": 0.12488333653907964,
|
|
"learning_rate": 3.591306185712273e-06,
|
|
"loss": 1.2382,
|
|
"mean_token_accuracy": 0.703354787826538,
|
|
"num_tokens": 409152396.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 1.28125,
|
|
"epoch": 0.6081727483941506,
|
|
"grad_norm": 0.10690483217723208,
|
|
"learning_rate": 3.5877835705227566e-06,
|
|
"loss": 1.2761,
|
|
"mean_token_accuracy": 0.6967222213745117,
|
|
"num_tokens": 410053504.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.6095394287276206,
|
|
"grad_norm": 0.14038386471183958,
|
|
"learning_rate": 3.5842609553332398e-06,
|
|
"loss": 1.2179,
|
|
"mean_token_accuracy": 0.708868020772934,
|
|
"num_tokens": 410972109.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.6109061090610907,
|
|
"grad_norm": 0.12230976580598579,
|
|
"learning_rate": 3.580738340143723e-06,
|
|
"loss": 1.2599,
|
|
"mean_token_accuracy": 0.699013888835907,
|
|
"num_tokens": 411889574.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.6122727893945606,
|
|
"grad_norm": 0.21080143820123828,
|
|
"learning_rate": 3.577215724954206e-06,
|
|
"loss": 1.2233,
|
|
"mean_token_accuracy": 0.7066199600696563,
|
|
"num_tokens": 412842473.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 0.6136394697280306,
|
|
"grad_norm": 0.1049712419641927,
|
|
"learning_rate": 3.57369310976469e-06,
|
|
"loss": 1.2503,
|
|
"mean_token_accuracy": 0.698589950799942,
|
|
"num_tokens": 413720887.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.6150061500615006,
|
|
"grad_norm": 0.19074424241802312,
|
|
"learning_rate": 3.570170494575173e-06,
|
|
"loss": 1.2671,
|
|
"mean_token_accuracy": 0.7033799767494202,
|
|
"num_tokens": 414663481.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 1.2984375,
|
|
"epoch": 0.6163728303949706,
|
|
"grad_norm": 0.11978344502930567,
|
|
"learning_rate": 3.5666478793856562e-06,
|
|
"loss": 1.3183,
|
|
"mean_token_accuracy": 0.688888818025589,
|
|
"num_tokens": 415566128.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.6177395107284406,
|
|
"grad_norm": 0.12937138576090124,
|
|
"learning_rate": 3.5631252641961394e-06,
|
|
"loss": 1.2211,
|
|
"mean_token_accuracy": 0.7067637085914612,
|
|
"num_tokens": 416455167.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 0.6191061910619107,
|
|
"grad_norm": 0.11872326975376135,
|
|
"learning_rate": 3.5596026490066225e-06,
|
|
"loss": 1.1868,
|
|
"mean_token_accuracy": 0.7123356521129608,
|
|
"num_tokens": 417403231.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 1.21328125,
|
|
"epoch": 0.6204728713953807,
|
|
"grad_norm": 0.11248272396028247,
|
|
"learning_rate": 3.556080033817106e-06,
|
|
"loss": 1.2276,
|
|
"mean_token_accuracy": 0.7068951964378357,
|
|
"num_tokens": 418291523.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 1.2703125,
|
|
"epoch": 0.6218395517288506,
|
|
"grad_norm": 0.12011763281801546,
|
|
"learning_rate": 3.5525574186275896e-06,
|
|
"loss": 1.2757,
|
|
"mean_token_accuracy": 0.6965153634548187,
|
|
"num_tokens": 419177494.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 1.26328125,
|
|
"epoch": 0.6232062320623206,
|
|
"grad_norm": 0.11770588250994701,
|
|
"learning_rate": 3.5490348034380727e-06,
|
|
"loss": 1.2775,
|
|
"mean_token_accuracy": 0.6968955516815185,
|
|
"num_tokens": 420118197.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 1.26015625,
|
|
"epoch": 0.6245729123957906,
|
|
"grad_norm": 0.1313326105196626,
|
|
"learning_rate": 3.545512188248556e-06,
|
|
"loss": 1.2599,
|
|
"mean_token_accuracy": 0.698677521944046,
|
|
"num_tokens": 421017126.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.6259395927292606,
|
|
"grad_norm": 0.154059091763518,
|
|
"learning_rate": 3.5419895730590394e-06,
|
|
"loss": 1.21,
|
|
"mean_token_accuracy": 0.7071102917194366,
|
|
"num_tokens": 421902299.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.6273062730627307,
|
|
"grad_norm": 0.16599674325063313,
|
|
"learning_rate": 3.5384669578695225e-06,
|
|
"loss": 1.2647,
|
|
"mean_token_accuracy": 0.6996862113475799,
|
|
"num_tokens": 422810394.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 1.265625,
|
|
"epoch": 0.6286729533962007,
|
|
"grad_norm": 0.12241068553174494,
|
|
"learning_rate": 3.534944342680006e-06,
|
|
"loss": 1.2611,
|
|
"mean_token_accuracy": 0.7002546787261963,
|
|
"num_tokens": 423762943.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 0.6300396337296706,
|
|
"grad_norm": 0.13520383184071755,
|
|
"learning_rate": 3.5314217274904896e-06,
|
|
"loss": 1.2051,
|
|
"mean_token_accuracy": 0.7088696420192718,
|
|
"num_tokens": 424688079.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 0.6314063140631406,
|
|
"grad_norm": 0.12789908549329385,
|
|
"learning_rate": 3.5278991123009727e-06,
|
|
"loss": 1.2536,
|
|
"mean_token_accuracy": 0.7022966384887696,
|
|
"num_tokens": 425593816.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 1.246875,
|
|
"epoch": 0.6327729943966106,
|
|
"grad_norm": 0.18614024870028517,
|
|
"learning_rate": 3.524376497111456e-06,
|
|
"loss": 1.2534,
|
|
"mean_token_accuracy": 0.70036079287529,
|
|
"num_tokens": 426535659.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.6341396747300806,
|
|
"grad_norm": 0.17623320075372456,
|
|
"learning_rate": 3.520853881921939e-06,
|
|
"loss": 1.2368,
|
|
"mean_token_accuracy": 0.7041243433952331,
|
|
"num_tokens": 427448468.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.6355063550635507,
|
|
"grad_norm": 0.13072374979033305,
|
|
"learning_rate": 3.517331266732422e-06,
|
|
"loss": 1.2522,
|
|
"mean_token_accuracy": 0.7023147165775299,
|
|
"num_tokens": 428404261.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 1.28828125,
|
|
"epoch": 0.6368730353970207,
|
|
"grad_norm": 0.1308227705876066,
|
|
"learning_rate": 3.513808651542906e-06,
|
|
"loss": 1.2999,
|
|
"mean_token_accuracy": 0.6949020087718963,
|
|
"num_tokens": 429314923.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 0.6382397157304907,
|
|
"grad_norm": 0.2593199962929479,
|
|
"learning_rate": 3.510286036353389e-06,
|
|
"loss": 1.21,
|
|
"mean_token_accuracy": 0.7088545083999633,
|
|
"num_tokens": 430177549.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.6396063960639606,
|
|
"grad_norm": 0.10724152264901894,
|
|
"learning_rate": 3.5067634211638723e-06,
|
|
"loss": 1.2444,
|
|
"mean_token_accuracy": 0.7032991290092468,
|
|
"num_tokens": 431135284.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.6409730763974306,
|
|
"grad_norm": 0.10653265741284774,
|
|
"learning_rate": 3.5032408059743554e-06,
|
|
"loss": 1.2136,
|
|
"mean_token_accuracy": 0.7077821373939515,
|
|
"num_tokens": 432065066.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.6423397567309006,
|
|
"grad_norm": 0.12283107563363568,
|
|
"learning_rate": 3.499718190784839e-06,
|
|
"loss": 1.2327,
|
|
"mean_token_accuracy": 0.7040711641311646,
|
|
"num_tokens": 432943463.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.6437064370643707,
|
|
"grad_norm": 0.11451424041451594,
|
|
"learning_rate": 3.4961955755953225e-06,
|
|
"loss": 1.236,
|
|
"mean_token_accuracy": 0.7033401429653168,
|
|
"num_tokens": 433869594.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.6450731173978407,
|
|
"grad_norm": 0.12333976069072813,
|
|
"learning_rate": 3.4926729604058056e-06,
|
|
"loss": 1.2657,
|
|
"mean_token_accuracy": 0.6986251175403595,
|
|
"num_tokens": 434779798.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 0.6464397977313107,
|
|
"grad_norm": 0.12703151531767534,
|
|
"learning_rate": 3.4891503452162888e-06,
|
|
"loss": 1.2149,
|
|
"mean_token_accuracy": 0.7095134794712067,
|
|
"num_tokens": 435652803.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 0.6478064780647806,
|
|
"grad_norm": 0.2007606443743229,
|
|
"learning_rate": 3.4856277300267723e-06,
|
|
"loss": 1.2025,
|
|
"mean_token_accuracy": 0.7102080225944519,
|
|
"num_tokens": 436545226.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 0.6491731583982506,
|
|
"grad_norm": 0.1454457629710975,
|
|
"learning_rate": 3.4821051148372554e-06,
|
|
"loss": 1.2187,
|
|
"mean_token_accuracy": 0.7070274412631988,
|
|
"num_tokens": 437438317.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.6505398387317206,
|
|
"grad_norm": 0.15603636293694187,
|
|
"learning_rate": 3.4785824996477386e-06,
|
|
"loss": 1.1765,
|
|
"mean_token_accuracy": 0.7153091609477997,
|
|
"num_tokens": 438316880.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 0.6519065190651907,
|
|
"grad_norm": 0.10970372993422492,
|
|
"learning_rate": 3.475059884458222e-06,
|
|
"loss": 1.1658,
|
|
"mean_token_accuracy": 0.7168197572231293,
|
|
"num_tokens": 439224474.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.6532731993986607,
|
|
"grad_norm": 0.11511846504056548,
|
|
"learning_rate": 3.4715372692687057e-06,
|
|
"loss": 1.1882,
|
|
"mean_token_accuracy": 0.7122540235519409,
|
|
"num_tokens": 440100963.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.6546398797321307,
|
|
"grad_norm": 0.11383308697365951,
|
|
"learning_rate": 3.4680146540791888e-06,
|
|
"loss": 1.2599,
|
|
"mean_token_accuracy": 0.7000897765159607,
|
|
"num_tokens": 441037518.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.6560065600656007,
|
|
"grad_norm": 0.14690448797400688,
|
|
"learning_rate": 3.464492038889672e-06,
|
|
"loss": 1.2294,
|
|
"mean_token_accuracy": 0.706351774930954,
|
|
"num_tokens": 441908052.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.6573732403990706,
|
|
"grad_norm": 0.11304228539731952,
|
|
"learning_rate": 3.460969423700155e-06,
|
|
"loss": 1.237,
|
|
"mean_token_accuracy": 0.7019992053508759,
|
|
"num_tokens": 442877731.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.6587399207325406,
|
|
"grad_norm": 0.10900739783208595,
|
|
"learning_rate": 3.457446808510639e-06,
|
|
"loss": 1.2262,
|
|
"mean_token_accuracy": 0.7067185640335083,
|
|
"num_tokens": 443815286.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.6601066010660107,
|
|
"grad_norm": 0.11535356189834278,
|
|
"learning_rate": 3.453924193321122e-06,
|
|
"loss": 1.2349,
|
|
"mean_token_accuracy": 0.7039661467075348,
|
|
"num_tokens": 444773427.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.6614732813994807,
|
|
"grad_norm": 0.12245495215688683,
|
|
"learning_rate": 3.4504015781316052e-06,
|
|
"loss": 1.1899,
|
|
"mean_token_accuracy": 0.712764710187912,
|
|
"num_tokens": 445718805.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 1.2609375,
|
|
"epoch": 0.6628399617329507,
|
|
"grad_norm": 0.1151253879763085,
|
|
"learning_rate": 3.4468789629420884e-06,
|
|
"loss": 1.2742,
|
|
"mean_token_accuracy": 0.6960287451744079,
|
|
"num_tokens": 446702383.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 0.6642066420664207,
|
|
"grad_norm": 0.1205628228625237,
|
|
"learning_rate": 3.4433563477525715e-06,
|
|
"loss": 1.2004,
|
|
"mean_token_accuracy": 0.7124032557010651,
|
|
"num_tokens": 447616023.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.6655733223998906,
|
|
"grad_norm": 0.120217276704032,
|
|
"learning_rate": 3.439833732563055e-06,
|
|
"loss": 1.2305,
|
|
"mean_token_accuracy": 0.7063743710517884,
|
|
"num_tokens": 448525850.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 0.6669400027333606,
|
|
"grad_norm": 0.12369916132062271,
|
|
"learning_rate": 3.4363111173735386e-06,
|
|
"loss": 1.2424,
|
|
"mean_token_accuracy": 0.7029082834720611,
|
|
"num_tokens": 449488499.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 0.6683066830668307,
|
|
"grad_norm": 0.1175390038131874,
|
|
"learning_rate": 3.4327885021840217e-06,
|
|
"loss": 1.2383,
|
|
"mean_token_accuracy": 0.7044375598430633,
|
|
"num_tokens": 450427887.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 1.23671875,
|
|
"epoch": 0.6696733634003007,
|
|
"grad_norm": 0.12427217056248824,
|
|
"learning_rate": 3.429265886994505e-06,
|
|
"loss": 1.2498,
|
|
"mean_token_accuracy": 0.7021997630596161,
|
|
"num_tokens": 451388079.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.6710400437337707,
|
|
"grad_norm": 0.11516162502809693,
|
|
"learning_rate": 3.4257432718049884e-06,
|
|
"loss": 1.2153,
|
|
"mean_token_accuracy": 0.7076626658439636,
|
|
"num_tokens": 452325548.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 1.21953125,
|
|
"epoch": 0.6724067240672407,
|
|
"grad_norm": 0.1315236673752748,
|
|
"learning_rate": 3.4222206566154715e-06,
|
|
"loss": 1.2385,
|
|
"mean_token_accuracy": 0.7059339284896851,
|
|
"num_tokens": 453283517.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 0.6737734044007107,
|
|
"grad_norm": 0.11582420850505666,
|
|
"learning_rate": 3.418698041425955e-06,
|
|
"loss": 1.2543,
|
|
"mean_token_accuracy": 0.7001114428043366,
|
|
"num_tokens": 454196973.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 0.6751400847341806,
|
|
"grad_norm": 0.1265575169073209,
|
|
"learning_rate": 3.415175426236438e-06,
|
|
"loss": 1.1953,
|
|
"mean_token_accuracy": 0.7125938832759857,
|
|
"num_tokens": 455141874.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 0.6765067650676507,
|
|
"grad_norm": 0.11080878427089126,
|
|
"learning_rate": 3.4116528110469217e-06,
|
|
"loss": 1.1626,
|
|
"mean_token_accuracy": 0.7207128584384919,
|
|
"num_tokens": 456058190.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 0.6778734454011207,
|
|
"grad_norm": 0.13840426254215915,
|
|
"learning_rate": 3.408130195857405e-06,
|
|
"loss": 1.1983,
|
|
"mean_token_accuracy": 0.7112608909606933,
|
|
"num_tokens": 456928962.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 0.6792401257345907,
|
|
"grad_norm": 0.15361762093244,
|
|
"learning_rate": 3.404607580667888e-06,
|
|
"loss": 1.2241,
|
|
"mean_token_accuracy": 0.7069453239440918,
|
|
"num_tokens": 457856361.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 0.6806068060680607,
|
|
"grad_norm": 0.22396039636956702,
|
|
"learning_rate": 3.401084965478371e-06,
|
|
"loss": 1.1757,
|
|
"mean_token_accuracy": 0.7176157772541046,
|
|
"num_tokens": 458756840.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.6819734864015307,
|
|
"grad_norm": 0.11583482430732897,
|
|
"learning_rate": 3.397562350288855e-06,
|
|
"loss": 1.2354,
|
|
"mean_token_accuracy": 0.7063021421432495,
|
|
"num_tokens": 459682905.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 0.6833401667350006,
|
|
"grad_norm": 0.12257947198058074,
|
|
"learning_rate": 3.394039735099338e-06,
|
|
"loss": 1.2103,
|
|
"mean_token_accuracy": 0.7084886729717255,
|
|
"num_tokens": 460604101.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 0.6847068470684707,
|
|
"grad_norm": 0.15792035158375167,
|
|
"learning_rate": 3.3905171199098213e-06,
|
|
"loss": 1.1737,
|
|
"mean_token_accuracy": 0.7141755223274231,
|
|
"num_tokens": 461512991.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.6860735274019407,
|
|
"grad_norm": 0.11074968416106187,
|
|
"learning_rate": 3.3869945047203044e-06,
|
|
"loss": 1.2039,
|
|
"mean_token_accuracy": 0.7091189026832581,
|
|
"num_tokens": 462454053.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.6874402077354107,
|
|
"grad_norm": 0.11656073666042843,
|
|
"learning_rate": 3.3834718895307876e-06,
|
|
"loss": 1.2189,
|
|
"mean_token_accuracy": 0.7055296778678894,
|
|
"num_tokens": 463360778.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.6888068880688807,
|
|
"grad_norm": 0.1100303052872263,
|
|
"learning_rate": 3.3799492743412715e-06,
|
|
"loss": 1.2385,
|
|
"mean_token_accuracy": 0.7048618495464325,
|
|
"num_tokens": 464284769.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.6901735684023507,
|
|
"grad_norm": 0.12393181103653653,
|
|
"learning_rate": 3.3764266591517547e-06,
|
|
"loss": 1.1963,
|
|
"mean_token_accuracy": 0.7116320490837097,
|
|
"num_tokens": 465220861.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 0.6915402487358207,
|
|
"grad_norm": 0.13009338844543963,
|
|
"learning_rate": 3.3729040439622378e-06,
|
|
"loss": 1.2085,
|
|
"mean_token_accuracy": 0.7077995598316192,
|
|
"num_tokens": 466152597.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 1.23671875,
|
|
"epoch": 0.6929069290692907,
|
|
"grad_norm": 0.14526147616984425,
|
|
"learning_rate": 3.3693814287727213e-06,
|
|
"loss": 1.2378,
|
|
"mean_token_accuracy": 0.7020161926746369,
|
|
"num_tokens": 467052088.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 1.25078125,
|
|
"epoch": 0.6942736094027607,
|
|
"grad_norm": 0.12289020622261099,
|
|
"learning_rate": 3.3658588135832044e-06,
|
|
"loss": 1.2563,
|
|
"mean_token_accuracy": 0.701757138967514,
|
|
"num_tokens": 468034958.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 0.6956402897362307,
|
|
"grad_norm": 0.11691953335607043,
|
|
"learning_rate": 3.3623361983936876e-06,
|
|
"loss": 1.2132,
|
|
"mean_token_accuracy": 0.7101219296455383,
|
|
"num_tokens": 469021348.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.6970069700697007,
|
|
"grad_norm": 0.11560840418043987,
|
|
"learning_rate": 3.358813583204171e-06,
|
|
"loss": 1.2333,
|
|
"mean_token_accuracy": 0.7035651683807373,
|
|
"num_tokens": 469996550.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 1.28203125,
|
|
"epoch": 0.6983736504031707,
|
|
"grad_norm": 0.1266125696176154,
|
|
"learning_rate": 3.3552909680146547e-06,
|
|
"loss": 1.2839,
|
|
"mean_token_accuracy": 0.6961874008178711,
|
|
"num_tokens": 470920985.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.6997403307366407,
|
|
"grad_norm": 0.1225203921853884,
|
|
"learning_rate": 3.3517683528251378e-06,
|
|
"loss": 1.1873,
|
|
"mean_token_accuracy": 0.7122327089309692,
|
|
"num_tokens": 471845765.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 0.7011070110701108,
|
|
"grad_norm": 0.13731756350916546,
|
|
"learning_rate": 3.348245737635621e-06,
|
|
"loss": 1.182,
|
|
"mean_token_accuracy": 0.7142041504383088,
|
|
"num_tokens": 472791415.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.7024736914035807,
|
|
"grad_norm": 0.10764950930424848,
|
|
"learning_rate": 3.344723122446104e-06,
|
|
"loss": 1.2359,
|
|
"mean_token_accuracy": 0.7033789992332459,
|
|
"num_tokens": 473685911.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.7038403717370507,
|
|
"grad_norm": 0.14132523520506943,
|
|
"learning_rate": 3.341200507256587e-06,
|
|
"loss": 1.2529,
|
|
"mean_token_accuracy": 0.7023256063461304,
|
|
"num_tokens": 474598701.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 1.2578125,
|
|
"epoch": 0.7052070520705207,
|
|
"grad_norm": 0.12388830950041563,
|
|
"learning_rate": 3.337677892067071e-06,
|
|
"loss": 1.2548,
|
|
"mean_token_accuracy": 0.7027208983898163,
|
|
"num_tokens": 475505552.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 0.7065737324039907,
|
|
"grad_norm": 0.12795739899894648,
|
|
"learning_rate": 3.3341552768775543e-06,
|
|
"loss": 1.2474,
|
|
"mean_token_accuracy": 0.6991074502468109,
|
|
"num_tokens": 476440433.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.7079404127374607,
|
|
"grad_norm": 0.1171381122039063,
|
|
"learning_rate": 3.3306326616880374e-06,
|
|
"loss": 1.2414,
|
|
"mean_token_accuracy": 0.7032878875732422,
|
|
"num_tokens": 477364926.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.7093070930709308,
|
|
"grad_norm": 0.11139803149404429,
|
|
"learning_rate": 3.3271100464985205e-06,
|
|
"loss": 1.2646,
|
|
"mean_token_accuracy": 0.7013797461986542,
|
|
"num_tokens": 478303729.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 0.7106737734044007,
|
|
"grad_norm": 0.11648880234969633,
|
|
"learning_rate": 3.323587431309004e-06,
|
|
"loss": 1.2162,
|
|
"mean_token_accuracy": 0.7110657870769501,
|
|
"num_tokens": 479265206.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 1.26171875,
|
|
"epoch": 0.7120404537378707,
|
|
"grad_norm": 0.11945768048247357,
|
|
"learning_rate": 3.3200648161194876e-06,
|
|
"loss": 1.2514,
|
|
"mean_token_accuracy": 0.699717915058136,
|
|
"num_tokens": 480196185.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.7134071340713407,
|
|
"grad_norm": 0.12175334891734836,
|
|
"learning_rate": 3.3165422009299707e-06,
|
|
"loss": 1.2261,
|
|
"mean_token_accuracy": 0.7066967844963074,
|
|
"num_tokens": 481119022.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 0.7147738144048107,
|
|
"grad_norm": 0.11476715820809719,
|
|
"learning_rate": 3.313019585740454e-06,
|
|
"loss": 1.2312,
|
|
"mean_token_accuracy": 0.70535329580307,
|
|
"num_tokens": 482034528.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 1.25546875,
|
|
"epoch": 0.7161404947382807,
|
|
"grad_norm": 0.12817309557237033,
|
|
"learning_rate": 3.3094969705509374e-06,
|
|
"loss": 1.2678,
|
|
"mean_token_accuracy": 0.6970474898815155,
|
|
"num_tokens": 482963072.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.7175071750717508,
|
|
"grad_norm": 0.11618018978235811,
|
|
"learning_rate": 3.3059743553614205e-06,
|
|
"loss": 1.2149,
|
|
"mean_token_accuracy": 0.7080328524112701,
|
|
"num_tokens": 483869466.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.7188738554052208,
|
|
"grad_norm": 0.13732147972976244,
|
|
"learning_rate": 3.3024517401719036e-06,
|
|
"loss": 1.1959,
|
|
"mean_token_accuracy": 0.7101061642169952,
|
|
"num_tokens": 484772219.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.7202405357386907,
|
|
"grad_norm": 0.12373824830435415,
|
|
"learning_rate": 3.298929124982387e-06,
|
|
"loss": 1.2381,
|
|
"mean_token_accuracy": 0.708605146408081,
|
|
"num_tokens": 485705049.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 1.25078125,
|
|
"epoch": 0.7216072160721607,
|
|
"grad_norm": 0.1288287131929968,
|
|
"learning_rate": 3.2954065097928707e-06,
|
|
"loss": 1.267,
|
|
"mean_token_accuracy": 0.6997636556625366,
|
|
"num_tokens": 486647525.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 1.23984375,
|
|
"epoch": 0.7229738964056307,
|
|
"grad_norm": 0.11629516629053452,
|
|
"learning_rate": 3.291883894603354e-06,
|
|
"loss": 1.2467,
|
|
"mean_token_accuracy": 0.7045997560024262,
|
|
"num_tokens": 487571730.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 0.7243405767391007,
|
|
"grad_norm": 0.11978115581947732,
|
|
"learning_rate": 3.288361279413837e-06,
|
|
"loss": 1.1729,
|
|
"mean_token_accuracy": 0.7136492431163788,
|
|
"num_tokens": 488486162.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 0.7257072570725708,
|
|
"grad_norm": 0.11497008492898537,
|
|
"learning_rate": 3.28483866422432e-06,
|
|
"loss": 1.1946,
|
|
"mean_token_accuracy": 0.713703316450119,
|
|
"num_tokens": 489397994.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 0.7270739374060408,
|
|
"grad_norm": 0.1238354185775525,
|
|
"learning_rate": 3.281316049034804e-06,
|
|
"loss": 1.2543,
|
|
"mean_token_accuracy": 0.6984295070171356,
|
|
"num_tokens": 490378487.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.7284406177395107,
|
|
"grad_norm": 0.12084801334951231,
|
|
"learning_rate": 3.277793433845287e-06,
|
|
"loss": 1.2113,
|
|
"mean_token_accuracy": 0.7047815501689911,
|
|
"num_tokens": 491314657.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.7298072980729807,
|
|
"grad_norm": 0.12552033534560103,
|
|
"learning_rate": 3.2742708186557703e-06,
|
|
"loss": 1.202,
|
|
"mean_token_accuracy": 0.7128627836704254,
|
|
"num_tokens": 492253180.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 0.7311739784064507,
|
|
"grad_norm": 0.1622472683516829,
|
|
"learning_rate": 3.2707482034662534e-06,
|
|
"loss": 1.2276,
|
|
"mean_token_accuracy": 0.7037956833839416,
|
|
"num_tokens": 493185858.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 0.7325406587399207,
|
|
"grad_norm": 0.12479484799433292,
|
|
"learning_rate": 3.2672255882767366e-06,
|
|
"loss": 1.2135,
|
|
"mean_token_accuracy": 0.711555677652359,
|
|
"num_tokens": 494110299.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.7339073390733908,
|
|
"grad_norm": 0.11292106387680623,
|
|
"learning_rate": 3.26370297308722e-06,
|
|
"loss": 1.2045,
|
|
"mean_token_accuracy": 0.7099689483642578,
|
|
"num_tokens": 495063936.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.7352740194068608,
|
|
"grad_norm": 0.12266697947007943,
|
|
"learning_rate": 3.2601803578977037e-06,
|
|
"loss": 1.2246,
|
|
"mean_token_accuracy": 0.7046516478061676,
|
|
"num_tokens": 495982933.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 0.7366406997403308,
|
|
"grad_norm": 0.10884354334044018,
|
|
"learning_rate": 3.2566577427081868e-06,
|
|
"loss": 1.1673,
|
|
"mean_token_accuracy": 0.7160395622253418,
|
|
"num_tokens": 496933673.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 0.7380073800738007,
|
|
"grad_norm": 0.11591395508864376,
|
|
"learning_rate": 3.25313512751867e-06,
|
|
"loss": 1.2079,
|
|
"mean_token_accuracy": 0.7086668014526367,
|
|
"num_tokens": 497833208.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 1.25859375,
|
|
"epoch": 0.7393740604072707,
|
|
"grad_norm": 0.11955890751973734,
|
|
"learning_rate": 3.2496125123291535e-06,
|
|
"loss": 1.2698,
|
|
"mean_token_accuracy": 0.6981489002704621,
|
|
"num_tokens": 498759050.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 0.7407407407407407,
|
|
"grad_norm": 0.12199863825456295,
|
|
"learning_rate": 3.2460898971396366e-06,
|
|
"loss": 1.2005,
|
|
"mean_token_accuracy": 0.7135752856731414,
|
|
"num_tokens": 499683263.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 0.7421074210742108,
|
|
"grad_norm": 0.16220258397633874,
|
|
"learning_rate": 3.24256728195012e-06,
|
|
"loss": 1.2201,
|
|
"mean_token_accuracy": 0.7069534718990326,
|
|
"num_tokens": 500587418.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.7434741014076808,
|
|
"grad_norm": 0.13263466162301032,
|
|
"learning_rate": 3.2390446667606037e-06,
|
|
"loss": 1.2132,
|
|
"mean_token_accuracy": 0.7076795816421508,
|
|
"num_tokens": 501538593.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.7448407817411508,
|
|
"grad_norm": 0.11217013421891467,
|
|
"learning_rate": 3.235522051571087e-06,
|
|
"loss": 1.2093,
|
|
"mean_token_accuracy": 0.7108061909675598,
|
|
"num_tokens": 502458567.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.7462074620746207,
|
|
"grad_norm": 0.15699336685320758,
|
|
"learning_rate": 3.23199943638157e-06,
|
|
"loss": 1.2551,
|
|
"mean_token_accuracy": 0.7018289923667907,
|
|
"num_tokens": 503343942.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 1.2453125,
|
|
"epoch": 0.7475741424080907,
|
|
"grad_norm": 0.11970689567983965,
|
|
"learning_rate": 3.228476821192053e-06,
|
|
"loss": 1.2705,
|
|
"mean_token_accuracy": 0.6990472137928009,
|
|
"num_tokens": 504273657.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 0.7489408227415607,
|
|
"grad_norm": 0.1336025529103702,
|
|
"learning_rate": 3.224954206002536e-06,
|
|
"loss": 1.244,
|
|
"mean_token_accuracy": 0.7030104517936706,
|
|
"num_tokens": 505223492.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 1.202734375,
|
|
"epoch": 0.7503075030750308,
|
|
"grad_norm": 0.1205158087402134,
|
|
"learning_rate": 3.22143159081302e-06,
|
|
"loss": 1.2037,
|
|
"mean_token_accuracy": 0.7119066655635834,
|
|
"num_tokens": 506149685.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 1.26015625,
|
|
"epoch": 0.7516741834085008,
|
|
"grad_norm": 0.13107275333802126,
|
|
"learning_rate": 3.2179089756235033e-06,
|
|
"loss": 1.2527,
|
|
"mean_token_accuracy": 0.7006147742271424,
|
|
"num_tokens": 507071941.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.7530408637419708,
|
|
"grad_norm": 0.12175972446282311,
|
|
"learning_rate": 3.2143863604339864e-06,
|
|
"loss": 1.2592,
|
|
"mean_token_accuracy": 0.6999919772148132,
|
|
"num_tokens": 508013836.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 1.23359375,
|
|
"epoch": 0.7544075440754408,
|
|
"grad_norm": 0.18732728022243378,
|
|
"learning_rate": 3.2108637452444695e-06,
|
|
"loss": 1.2416,
|
|
"mean_token_accuracy": 0.7048626244068146,
|
|
"num_tokens": 508928276.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 1.2546875,
|
|
"epoch": 0.7557742244089107,
|
|
"grad_norm": 0.12047589263797466,
|
|
"learning_rate": 3.2073411300549526e-06,
|
|
"loss": 1.2573,
|
|
"mean_token_accuracy": 0.7026072084903717,
|
|
"num_tokens": 509829624.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 0.7571409047423807,
|
|
"grad_norm": 0.11267520330380569,
|
|
"learning_rate": 3.2038185148654366e-06,
|
|
"loss": 1.1854,
|
|
"mean_token_accuracy": 0.7140181303024292,
|
|
"num_tokens": 510741617.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 1.23828125,
|
|
"epoch": 0.7585075850758508,
|
|
"grad_norm": 0.12463760813474879,
|
|
"learning_rate": 3.2002958996759197e-06,
|
|
"loss": 1.2468,
|
|
"mean_token_accuracy": 0.7017064452171325,
|
|
"num_tokens": 511633875.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.7598742654093208,
|
|
"grad_norm": 0.13350413498532687,
|
|
"learning_rate": 3.196773284486403e-06,
|
|
"loss": 1.2455,
|
|
"mean_token_accuracy": 0.704252815246582,
|
|
"num_tokens": 512550694.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 1.25703125,
|
|
"epoch": 0.7612409457427908,
|
|
"grad_norm": 0.2149353282277156,
|
|
"learning_rate": 3.1932506692968864e-06,
|
|
"loss": 1.2617,
|
|
"mean_token_accuracy": 0.698896062374115,
|
|
"num_tokens": 513493021.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.7626076260762608,
|
|
"grad_norm": 0.11046533875327744,
|
|
"learning_rate": 3.1897280541073695e-06,
|
|
"loss": 1.2246,
|
|
"mean_token_accuracy": 0.7070587396621704,
|
|
"num_tokens": 514438780.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.7639743064097307,
|
|
"grad_norm": 0.11471775273965303,
|
|
"learning_rate": 3.1862054389178526e-06,
|
|
"loss": 1.2416,
|
|
"mean_token_accuracy": 0.7044345378875733,
|
|
"num_tokens": 515416366.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 0.7653409867432007,
|
|
"grad_norm": 0.1281202501882318,
|
|
"learning_rate": 3.182682823728336e-06,
|
|
"loss": 1.1984,
|
|
"mean_token_accuracy": 0.7085534512996674,
|
|
"num_tokens": 516348299.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.7667076670766708,
|
|
"grad_norm": 0.1084249354072781,
|
|
"learning_rate": 3.1791602085388197e-06,
|
|
"loss": 1.1981,
|
|
"mean_token_accuracy": 0.7123336374759675,
|
|
"num_tokens": 936433.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 0.7680743474101408,
|
|
"grad_norm": 0.1118579294168003,
|
|
"learning_rate": 3.175637593349303e-06,
|
|
"loss": 1.265,
|
|
"mean_token_accuracy": 0.6998822033405304,
|
|
"num_tokens": 1835907.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 1.2421875,
|
|
"epoch": 0.7694410277436108,
|
|
"grad_norm": 0.10953118401036148,
|
|
"learning_rate": 3.172114978159786e-06,
|
|
"loss": 1.2468,
|
|
"mean_token_accuracy": 0.70295569896698,
|
|
"num_tokens": 2784865.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.7708077080770808,
|
|
"grad_norm": 0.12202011946782201,
|
|
"learning_rate": 3.168592362970269e-06,
|
|
"loss": 1.2385,
|
|
"mean_token_accuracy": 0.7062244355678559,
|
|
"num_tokens": 3669771.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.7721743884105507,
|
|
"grad_norm": 0.12574319530096023,
|
|
"learning_rate": 3.165069747780753e-06,
|
|
"loss": 1.2075,
|
|
"mean_token_accuracy": 0.7103377342224121,
|
|
"num_tokens": 4561959.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 0.7735410687440207,
|
|
"grad_norm": 0.13829311757575252,
|
|
"learning_rate": 3.161547132591236e-06,
|
|
"loss": 1.2618,
|
|
"mean_token_accuracy": 0.6996250331401825,
|
|
"num_tokens": 5509796.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.7749077490774908,
|
|
"grad_norm": 0.13404396756703624,
|
|
"learning_rate": 3.1580245174017193e-06,
|
|
"loss": 1.2517,
|
|
"mean_token_accuracy": 0.7019116401672363,
|
|
"num_tokens": 6447819.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 1.2453125,
|
|
"epoch": 0.7762744294109608,
|
|
"grad_norm": 0.12497396017589466,
|
|
"learning_rate": 3.1545019022122025e-06,
|
|
"loss": 1.2506,
|
|
"mean_token_accuracy": 0.7023446917533874,
|
|
"num_tokens": 7369230.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 1.2625,
|
|
"epoch": 0.7776411097444308,
|
|
"grad_norm": 0.11582358402385572,
|
|
"learning_rate": 3.1509792870226856e-06,
|
|
"loss": 1.2733,
|
|
"mean_token_accuracy": 0.697501665353775,
|
|
"num_tokens": 8301261.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 0.7790077900779008,
|
|
"grad_norm": 0.18232600118597048,
|
|
"learning_rate": 3.147456671833169e-06,
|
|
"loss": 1.2066,
|
|
"mean_token_accuracy": 0.7105979442596435,
|
|
"num_tokens": 9213381.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 0.7803744704113708,
|
|
"grad_norm": 0.11929449046087212,
|
|
"learning_rate": 3.1439340566436527e-06,
|
|
"loss": 1.2088,
|
|
"mean_token_accuracy": 0.708895868062973,
|
|
"num_tokens": 10139468.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 1.234375,
|
|
"epoch": 0.7817411507448407,
|
|
"grad_norm": 0.14431471461023768,
|
|
"learning_rate": 3.140411441454136e-06,
|
|
"loss": 1.2433,
|
|
"mean_token_accuracy": 0.7043908059597015,
|
|
"num_tokens": 11047461.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.7831078310783108,
|
|
"grad_norm": 0.12490485074617211,
|
|
"learning_rate": 3.136888826264619e-06,
|
|
"loss": 1.2298,
|
|
"mean_token_accuracy": 0.7041331231594086,
|
|
"num_tokens": 11967688.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.7844745114117808,
|
|
"grad_norm": 0.12468606371381945,
|
|
"learning_rate": 3.1333662110751025e-06,
|
|
"loss": 1.2019,
|
|
"mean_token_accuracy": 0.7100453913211823,
|
|
"num_tokens": 12874594.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 0.7858411917452508,
|
|
"grad_norm": 0.12303790638337361,
|
|
"learning_rate": 3.1298435958855856e-06,
|
|
"loss": 1.1679,
|
|
"mean_token_accuracy": 0.7174359023571014,
|
|
"num_tokens": 13753551.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 1.23359375,
|
|
"epoch": 0.7872078720787208,
|
|
"grad_norm": 0.2136598166384058,
|
|
"learning_rate": 3.126320980696069e-06,
|
|
"loss": 1.2397,
|
|
"mean_token_accuracy": 0.7040407478809356,
|
|
"num_tokens": 14652356.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.7885745524121908,
|
|
"grad_norm": 0.11317139738672886,
|
|
"learning_rate": 3.1227983655065523e-06,
|
|
"loss": 1.2277,
|
|
"mean_token_accuracy": 0.7054188191890717,
|
|
"num_tokens": 15554754.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 0.7899412327456607,
|
|
"grad_norm": 0.1284689720452441,
|
|
"learning_rate": 3.119275750317036e-06,
|
|
"loss": 1.1648,
|
|
"mean_token_accuracy": 0.7164071798324585,
|
|
"num_tokens": 16468872.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 1.26875,
|
|
"epoch": 0.7913079130791308,
|
|
"grad_norm": 0.13323326438863328,
|
|
"learning_rate": 3.115753135127519e-06,
|
|
"loss": 1.2769,
|
|
"mean_token_accuracy": 0.6958516895771026,
|
|
"num_tokens": 17400201.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.7926745934126008,
|
|
"grad_norm": 0.12824160443739493,
|
|
"learning_rate": 3.112230519938002e-06,
|
|
"loss": 1.2188,
|
|
"mean_token_accuracy": 0.7081718623638154,
|
|
"num_tokens": 18353537.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 0.7940412737460708,
|
|
"grad_norm": 0.13936505835915883,
|
|
"learning_rate": 3.108707904748485e-06,
|
|
"loss": 1.2017,
|
|
"mean_token_accuracy": 0.7112892985343933,
|
|
"num_tokens": 19211887.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 0.7954079540795408,
|
|
"grad_norm": 0.12727848417819188,
|
|
"learning_rate": 3.105185289558969e-06,
|
|
"loss": 1.2811,
|
|
"mean_token_accuracy": 0.6982558071613312,
|
|
"num_tokens": 20174696.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.7967746344130108,
|
|
"grad_norm": 0.1359327116252569,
|
|
"learning_rate": 3.1016626743694523e-06,
|
|
"loss": 1.2037,
|
|
"mean_token_accuracy": 0.7091135144233703,
|
|
"num_tokens": 21097709.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 0.7981413147464808,
|
|
"grad_norm": 0.10525853212342348,
|
|
"learning_rate": 3.0981400591799354e-06,
|
|
"loss": 1.1911,
|
|
"mean_token_accuracy": 0.7118173897266388,
|
|
"num_tokens": 22072723.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.7995079950799509,
|
|
"grad_norm": 0.1323032485053306,
|
|
"learning_rate": 3.0946174439904185e-06,
|
|
"loss": 1.1956,
|
|
"mean_token_accuracy": 0.7136429071426391,
|
|
"num_tokens": 23009806.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.8008746754134208,
|
|
"grad_norm": 0.11707203119405121,
|
|
"learning_rate": 3.0910948288009016e-06,
|
|
"loss": 1.1828,
|
|
"mean_token_accuracy": 0.7158657014369965,
|
|
"num_tokens": 23898755.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.8022413557468908,
|
|
"grad_norm": 0.11738354220208877,
|
|
"learning_rate": 3.0875722136113856e-06,
|
|
"loss": 1.2116,
|
|
"mean_token_accuracy": 0.70728879570961,
|
|
"num_tokens": 24815131.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.8036080360803608,
|
|
"grad_norm": 0.11942536742490899,
|
|
"learning_rate": 3.0840495984218687e-06,
|
|
"loss": 1.2247,
|
|
"mean_token_accuracy": 0.7067735016345977,
|
|
"num_tokens": 25759628.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.8049747164138308,
|
|
"grad_norm": 0.11001772588460024,
|
|
"learning_rate": 3.080526983232352e-06,
|
|
"loss": 1.2349,
|
|
"mean_token_accuracy": 0.7031912922859191,
|
|
"num_tokens": 26671114.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.8063413967473008,
|
|
"grad_norm": 0.11070871438081074,
|
|
"learning_rate": 3.0770043680428354e-06,
|
|
"loss": 1.2145,
|
|
"mean_token_accuracy": 0.7071738123893738,
|
|
"num_tokens": 27599083.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.8077080770807709,
|
|
"grad_norm": 0.13609660883493602,
|
|
"learning_rate": 3.0734817528533185e-06,
|
|
"loss": 1.215,
|
|
"mean_token_accuracy": 0.7099611461162567,
|
|
"num_tokens": 28499262.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 0.8090747574142408,
|
|
"grad_norm": 0.13535914177534294,
|
|
"learning_rate": 3.0699591376638017e-06,
|
|
"loss": 1.2363,
|
|
"mean_token_accuracy": 0.7042745292186737,
|
|
"num_tokens": 29408716.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 0.8104414377477108,
|
|
"grad_norm": 0.11918403048104813,
|
|
"learning_rate": 3.066436522474285e-06,
|
|
"loss": 1.1899,
|
|
"mean_token_accuracy": 0.7161609053611755,
|
|
"num_tokens": 30324949.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 0.8118081180811808,
|
|
"grad_norm": 0.14836528382881575,
|
|
"learning_rate": 3.0629139072847688e-06,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7139535129070282,
|
|
"num_tokens": 31302726.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.8131747984146508,
|
|
"grad_norm": 0.12109733422924294,
|
|
"learning_rate": 3.059391292095252e-06,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.7050518810749054,
|
|
"num_tokens": 32205195.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.8145414787481208,
|
|
"grad_norm": 0.11251255072895153,
|
|
"learning_rate": 3.055868676905735e-06,
|
|
"loss": 1.2383,
|
|
"mean_token_accuracy": 0.7049329936504364,
|
|
"num_tokens": 33175022.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.8159081590815909,
|
|
"grad_norm": 0.1215845509540891,
|
|
"learning_rate": 3.052346061716218e-06,
|
|
"loss": 1.2328,
|
|
"mean_token_accuracy": 0.7073344469070435,
|
|
"num_tokens": 34034198.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 1.23984375,
|
|
"epoch": 0.8172748394150608,
|
|
"grad_norm": 0.13386224007289227,
|
|
"learning_rate": 3.048823446526702e-06,
|
|
"loss": 1.2437,
|
|
"mean_token_accuracy": 0.7041427493095398,
|
|
"num_tokens": 34966893.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 1.25078125,
|
|
"epoch": 0.8186415197485308,
|
|
"grad_norm": 0.12790149819479815,
|
|
"learning_rate": 3.0453008313371852e-06,
|
|
"loss": 1.2583,
|
|
"mean_token_accuracy": 0.7024395644664765,
|
|
"num_tokens": 35935178.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 0.8200082000820008,
|
|
"grad_norm": 0.12522470670114147,
|
|
"learning_rate": 3.0417782161476683e-06,
|
|
"loss": 1.206,
|
|
"mean_token_accuracy": 0.7089483678340912,
|
|
"num_tokens": 36846585.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 1.27265625,
|
|
"epoch": 0.8213748804154708,
|
|
"grad_norm": 0.11709036339055638,
|
|
"learning_rate": 3.0382556009581515e-06,
|
|
"loss": 1.288,
|
|
"mean_token_accuracy": 0.6958957016468048,
|
|
"num_tokens": 37759237.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 0.8227415607489408,
|
|
"grad_norm": 0.1095426491063743,
|
|
"learning_rate": 3.0347329857686346e-06,
|
|
"loss": 1.1887,
|
|
"mean_token_accuracy": 0.7121895492076874,
|
|
"num_tokens": 38646491.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 0.8241082410824109,
|
|
"grad_norm": 0.13174626172676543,
|
|
"learning_rate": 3.031210370579118e-06,
|
|
"loss": 1.2046,
|
|
"mean_token_accuracy": 0.7138250410556793,
|
|
"num_tokens": 39569590.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 0.8254749214158809,
|
|
"grad_norm": 0.11845232652594334,
|
|
"learning_rate": 3.0276877553896017e-06,
|
|
"loss": 1.2191,
|
|
"mean_token_accuracy": 0.705291360616684,
|
|
"num_tokens": 40494227.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 0.8268416017493508,
|
|
"grad_norm": 0.47201133801341305,
|
|
"learning_rate": 3.024165140200085e-06,
|
|
"loss": 1.2153,
|
|
"mean_token_accuracy": 0.7094224333763123,
|
|
"num_tokens": 41406957.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 0.8282082820828208,
|
|
"grad_norm": 0.12058201998366966,
|
|
"learning_rate": 3.020642525010568e-06,
|
|
"loss": 1.2432,
|
|
"mean_token_accuracy": 0.7043605089187622,
|
|
"num_tokens": 42312109.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 0.8295749624162908,
|
|
"grad_norm": 0.12932457578303233,
|
|
"learning_rate": 3.0171199098210515e-06,
|
|
"loss": 1.1925,
|
|
"mean_token_accuracy": 0.7127811729907989,
|
|
"num_tokens": 43205327.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.8309416427497608,
|
|
"grad_norm": 0.12216616811746174,
|
|
"learning_rate": 3.0135972946315346e-06,
|
|
"loss": 1.2181,
|
|
"mean_token_accuracy": 0.7087445318698883,
|
|
"num_tokens": 44132350.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 0.8323083230832309,
|
|
"grad_norm": 0.11036947215898064,
|
|
"learning_rate": 3.010074679442018e-06,
|
|
"loss": 1.2249,
|
|
"mean_token_accuracy": 0.7074347078800202,
|
|
"num_tokens": 45027922.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.8336750034167009,
|
|
"grad_norm": 0.22380837981469265,
|
|
"learning_rate": 3.0065520642525013e-06,
|
|
"loss": 1.2304,
|
|
"mean_token_accuracy": 0.7057975113391877,
|
|
"num_tokens": 45926101.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 1.246875,
|
|
"epoch": 0.8350416837501708,
|
|
"grad_norm": 0.15585447831546495,
|
|
"learning_rate": 3.003029449062985e-06,
|
|
"loss": 1.2462,
|
|
"mean_token_accuracy": 0.7025263249874115,
|
|
"num_tokens": 46867327.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 0.8364083640836408,
|
|
"grad_norm": 0.12913103394621073,
|
|
"learning_rate": 2.999506833873468e-06,
|
|
"loss": 1.2188,
|
|
"mean_token_accuracy": 0.7073303401470185,
|
|
"num_tokens": 47743014.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.8377750444171108,
|
|
"grad_norm": 0.12031212640164285,
|
|
"learning_rate": 2.995984218683951e-06,
|
|
"loss": 1.237,
|
|
"mean_token_accuracy": 0.7024833381175994,
|
|
"num_tokens": 48663996.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 1.207421875,
|
|
"epoch": 0.8391417247505808,
|
|
"grad_norm": 0.1181774565430487,
|
|
"learning_rate": 2.992461603494434e-06,
|
|
"loss": 1.2176,
|
|
"mean_token_accuracy": 0.7058593213558197,
|
|
"num_tokens": 49606245.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 0.8405084050840509,
|
|
"grad_norm": 0.12196112213325576,
|
|
"learning_rate": 2.988938988304918e-06,
|
|
"loss": 1.213,
|
|
"mean_token_accuracy": 0.7080254197120667,
|
|
"num_tokens": 50546391.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.8418750854175209,
|
|
"grad_norm": 0.16908328007555462,
|
|
"learning_rate": 2.9854163731154013e-06,
|
|
"loss": 1.221,
|
|
"mean_token_accuracy": 0.7092726647853851,
|
|
"num_tokens": 51454783.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 0.8432417657509909,
|
|
"grad_norm": 0.11208771832046475,
|
|
"learning_rate": 2.9818937579258844e-06,
|
|
"loss": 1.2218,
|
|
"mean_token_accuracy": 0.7059466123580933,
|
|
"num_tokens": 52328523.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 0.8446084460844608,
|
|
"grad_norm": 0.1389927447277988,
|
|
"learning_rate": 2.9783711427363675e-06,
|
|
"loss": 1.2066,
|
|
"mean_token_accuracy": 0.7093114197254181,
|
|
"num_tokens": 53254038.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.8459751264179308,
|
|
"grad_norm": 0.12544284077513815,
|
|
"learning_rate": 2.9748485275468507e-06,
|
|
"loss": 1.2214,
|
|
"mean_token_accuracy": 0.708537882566452,
|
|
"num_tokens": 54165620.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 0.8473418067514008,
|
|
"grad_norm": 0.11383329296223385,
|
|
"learning_rate": 2.9713259123573346e-06,
|
|
"loss": 1.2163,
|
|
"mean_token_accuracy": 0.7092505097389221,
|
|
"num_tokens": 55045916.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.8487084870848709,
|
|
"grad_norm": 0.10982113518551015,
|
|
"learning_rate": 2.9678032971678177e-06,
|
|
"loss": 1.1946,
|
|
"mean_token_accuracy": 0.7109391212463378,
|
|
"num_tokens": 55917200.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 0.8500751674183409,
|
|
"grad_norm": 0.11586858936334594,
|
|
"learning_rate": 2.964280681978301e-06,
|
|
"loss": 1.2169,
|
|
"mean_token_accuracy": 0.7050465226173401,
|
|
"num_tokens": 56867516.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.8514418477518109,
|
|
"grad_norm": 0.17806770374687542,
|
|
"learning_rate": 2.960758066788784e-06,
|
|
"loss": 1.1826,
|
|
"mean_token_accuracy": 0.712931489944458,
|
|
"num_tokens": 57747545.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 0.8528085280852808,
|
|
"grad_norm": 0.11649470195070688,
|
|
"learning_rate": 2.9572354515992675e-06,
|
|
"loss": 1.2435,
|
|
"mean_token_accuracy": 0.7018972158432006,
|
|
"num_tokens": 58642544.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 0.8541752084187508,
|
|
"grad_norm": 0.1536329106125164,
|
|
"learning_rate": 2.9537128364097507e-06,
|
|
"loss": 1.143,
|
|
"mean_token_accuracy": 0.7252349972724914,
|
|
"num_tokens": 59562867.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.8555418887522208,
|
|
"grad_norm": 0.11246899037156909,
|
|
"learning_rate": 2.9501902212202342e-06,
|
|
"loss": 1.2269,
|
|
"mean_token_accuracy": 0.7061719954013824,
|
|
"num_tokens": 60516859.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.8569085690856909,
|
|
"grad_norm": 0.11940572562386943,
|
|
"learning_rate": 2.9466676060307178e-06,
|
|
"loss": 1.2185,
|
|
"mean_token_accuracy": 0.7062341749668122,
|
|
"num_tokens": 61450444.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.8582752494191609,
|
|
"grad_norm": 0.11920154134314857,
|
|
"learning_rate": 2.943144990841201e-06,
|
|
"loss": 1.2412,
|
|
"mean_token_accuracy": 0.7046826899051666,
|
|
"num_tokens": 62388643.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 0.8596419297526309,
|
|
"grad_norm": 0.13507286186806805,
|
|
"learning_rate": 2.939622375651684e-06,
|
|
"loss": 1.2258,
|
|
"mean_token_accuracy": 0.7078295528888703,
|
|
"num_tokens": 63309459.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 0.8610086100861009,
|
|
"grad_norm": 0.11907084070644422,
|
|
"learning_rate": 2.936099760462167e-06,
|
|
"loss": 1.2298,
|
|
"mean_token_accuracy": 0.7053798019886017,
|
|
"num_tokens": 64275817.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 0.8623752904195708,
|
|
"grad_norm": 0.11844123534566155,
|
|
"learning_rate": 2.932577145272651e-06,
|
|
"loss": 1.2115,
|
|
"mean_token_accuracy": 0.7104539155960083,
|
|
"num_tokens": 65169404.0,
|
|
"step": 6310
|
|
},
|
|
{
"entropy": 1.2328125,
"epoch": 0.8637419707530408,
"grad_norm": 0.1063744704798417,
"learning_rate": 2.9290545300831342e-06,
"loss": 1.236,
"mean_token_accuracy": 0.7045866191387177,
"num_tokens": 66110446.0,
"step": 6320
},
{
"entropy": 1.2390625,
"epoch": 0.8651086510865109,
"grad_norm": 0.1399201783348144,
"learning_rate": 2.9255319148936174e-06,
"loss": 1.2352,
"mean_token_accuracy": 0.7067779123783111,
"num_tokens": 67029079.0,
"step": 6330
},
{
"entropy": 1.153125,
"epoch": 0.8664753314199809,
"grad_norm": 0.3955950066120214,
"learning_rate": 2.9220092997041005e-06,
"loss": 1.1458,
"mean_token_accuracy": 0.72397301197052,
"num_tokens": 67964405.0,
"step": 6340
},
{
"entropy": 1.21171875,
"epoch": 0.8678420117534509,
"grad_norm": 0.1257181316822431,
"learning_rate": 2.9184866845145836e-06,
"loss": 1.2072,
"mean_token_accuracy": 0.7083312094211578,
"num_tokens": 68886366.0,
"step": 6350
},
{
"entropy": 1.22265625,
"epoch": 0.8692086920869209,
"grad_norm": 0.1231659569138324,
"learning_rate": 2.9149640693250667e-06,
"loss": 1.2221,
"mean_token_accuracy": 0.7075818061828614,
"num_tokens": 69836929.0,
"step": 6360
},
{
"entropy": 1.22734375,
"epoch": 0.8705753724203908,
"grad_norm": 0.12707013763523287,
"learning_rate": 2.9114414541355507e-06,
"loss": 1.2368,
"mean_token_accuracy": 0.7051069736480713,
"num_tokens": 70740719.0,
"step": 6370
},
{
"entropy": 1.2375,
"epoch": 0.8719420527538608,
"grad_norm": 0.12353098958555066,
"learning_rate": 2.907918838946034e-06,
"loss": 1.2365,
"mean_token_accuracy": 0.7067025423049926,
"num_tokens": 71669374.0,
"step": 6380
},
{
"entropy": 1.21796875,
"epoch": 0.8733087330873309,
"grad_norm": 0.11659367456214134,
"learning_rate": 2.904396223756517e-06,
"loss": 1.2092,
"mean_token_accuracy": 0.7113011717796326,
"num_tokens": 72587426.0,
"step": 6390
},
{
"entropy": 1.25078125,
"epoch": 0.8746754134208009,
"grad_norm": 0.12811007646967335,
"learning_rate": 2.9008736085670005e-06,
"loss": 1.2464,
"mean_token_accuracy": 0.703271359205246,
"num_tokens": 73503639.0,
"step": 6400
},
{
"entropy": 1.16796875,
"epoch": 0.8760420937542709,
"grad_norm": 0.12575158924130483,
"learning_rate": 2.8973509933774836e-06,
"loss": 1.1641,
"mean_token_accuracy": 0.7179130733013153,
"num_tokens": 74402342.0,
"step": 6410
},
{
"entropy": 1.22109375,
"epoch": 0.8774087740877409,
"grad_norm": 0.11987916468966597,
"learning_rate": 2.893828378187967e-06,
"loss": 1.2325,
"mean_token_accuracy": 0.7049903213977814,
"num_tokens": 75343294.0,
"step": 6420
},
{
"entropy": 1.18359375,
"epoch": 0.8787754544212109,
"grad_norm": 0.18934569475904667,
"learning_rate": 2.8903057629984503e-06,
"loss": 1.1792,
"mean_token_accuracy": 0.7150401592254638,
"num_tokens": 76237973.0,
"step": 6430
},
{
"entropy": 1.175,
"epoch": 0.8801421347546808,
"grad_norm": 0.1509324925171426,
"learning_rate": 2.886783147808934e-06,
"loss": 1.1772,
"mean_token_accuracy": 0.7174163639545441,
"num_tokens": 77175007.0,
"step": 6440
},
{
"entropy": 1.2328125,
"epoch": 0.8815088150881509,
"grad_norm": 0.15326330151324127,
"learning_rate": 2.883260532619417e-06,
"loss": 1.2422,
"mean_token_accuracy": 0.7017311334609986,
"num_tokens": 78066459.0,
"step": 6450
},
{
"entropy": 1.1984375,
"epoch": 0.8828754954216209,
"grad_norm": 0.13383050075081648,
"learning_rate": 2.8797379174299e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.7134045422077179,
"num_tokens": 78937672.0,
"step": 6460
},
{
"entropy": 1.18515625,
"epoch": 0.8842421757550909,
"grad_norm": 0.13091202944168553,
"learning_rate": 2.876215302240383e-06,
"loss": 1.1846,
"mean_token_accuracy": 0.7172662794589997,
"num_tokens": 79848700.0,
"step": 6470
},
{
"entropy": 1.23828125,
"epoch": 0.8856088560885609,
"grad_norm": 0.1151596616654579,
"learning_rate": 2.872692687050867e-06,
"loss": 1.2364,
"mean_token_accuracy": 0.7053985297679901,
"num_tokens": 80800591.0,
"step": 6480
},
{
"entropy": 1.2078125,
"epoch": 0.8869755364220309,
"grad_norm": 0.12074083860758246,
"learning_rate": 2.8691700718613503e-06,
"loss": 1.2133,
"mean_token_accuracy": 0.7070508122444152,
"num_tokens": 81715669.0,
"step": 6490
},
{
"entropy": 1.18125,
"epoch": 0.8883422167555008,
"grad_norm": 0.13454178126535296,
"learning_rate": 2.8656474566718334e-06,
"loss": 1.1872,
"mean_token_accuracy": 0.7108097434043884,
"num_tokens": 82668607.0,
"step": 6500
},
{
"entropy": 1.19453125,
"epoch": 0.8897088970889709,
"grad_norm": 0.11522815541088786,
"learning_rate": 2.8621248414823165e-06,
"loss": 1.2009,
"mean_token_accuracy": 0.7096296310424804,
"num_tokens": 83584024.0,
"step": 6510
},
{
"entropy": 1.22109375,
"epoch": 0.8910755774224409,
"grad_norm": 0.130549772069817,
"learning_rate": 2.8586022262927997e-06,
"loss": 1.2081,
"mean_token_accuracy": 0.7090779125690461,
"num_tokens": 84438991.0,
"step": 6520
},
{
"entropy": 1.27421875,
"epoch": 0.8924422577559109,
"grad_norm": 0.12080441701877721,
"learning_rate": 2.8550796111032836e-06,
"loss": 1.2807,
"mean_token_accuracy": 0.6944321811199188,
"num_tokens": 85370136.0,
"step": 6530
},
{
"entropy": 1.21015625,
"epoch": 0.8938089380893809,
"grad_norm": 0.12165244909074693,
"learning_rate": 2.8515569959137668e-06,
"loss": 1.2268,
"mean_token_accuracy": 0.7048497855663299,
"num_tokens": 86281080.0,
"step": 6540
},
{
"entropy": 1.1875,
"epoch": 0.8951756184228509,
"grad_norm": 0.12495001404586616,
"learning_rate": 2.84803438072425e-06,
"loss": 1.1928,
"mean_token_accuracy": 0.7146258652210236,
"num_tokens": 87193165.0,
"step": 6550
},
{
"entropy": 1.19375,
"epoch": 0.8965422987563209,
"grad_norm": 0.13301803016949762,
"learning_rate": 2.844511765534733e-06,
"loss": 1.2065,
"mean_token_accuracy": 0.7099649906158447,
"num_tokens": 88107556.0,
"step": 6560
},
{
"entropy": 1.1796875,
"epoch": 0.897908979089791,
"grad_norm": 0.10161974072534646,
"learning_rate": 2.8409891503452166e-06,
"loss": 1.1764,
"mean_token_accuracy": 0.7154452621936798,
"num_tokens": 89046707.0,
"step": 6570
},
{
"entropy": 1.20234375,
"epoch": 0.8992756594232609,
"grad_norm": 0.15943296839658266,
"learning_rate": 2.8374665351556997e-06,
"loss": 1.2097,
"mean_token_accuracy": 0.7077414333820343,
"num_tokens": 89937903.0,
"step": 6580
},
{
"entropy": 1.209375,
"epoch": 0.9006423397567309,
"grad_norm": 0.11878187740924406,
"learning_rate": 2.8339439199661832e-06,
"loss": 1.2116,
"mean_token_accuracy": 0.7118341803550721,
"num_tokens": 90900318.0,
"step": 6590
},
{
"entropy": 1.22109375,
"epoch": 0.9020090200902009,
"grad_norm": 0.12313796359317968,
"learning_rate": 2.8304213047766663e-06,
"loss": 1.229,
"mean_token_accuracy": 0.70570068359375,
"num_tokens": 91789659.0,
"step": 6600
},
{
"entropy": 1.16875,
"epoch": 0.9033757004236709,
"grad_norm": 0.14158852050037418,
"learning_rate": 2.82689868958715e-06,
"loss": 1.1717,
"mean_token_accuracy": 0.716703462600708,
"num_tokens": 92717250.0,
"step": 6610
},
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 0.9047423807571409,
|
|
"grad_norm": 0.14861073972470226,
|
|
"learning_rate": 2.823376074397633e-06,
|
|
"loss": 1.2081,
|
|
"mean_token_accuracy": 0.7078703761100769,
|
|
"num_tokens": 93648605.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.906109061090611,
|
|
"grad_norm": 0.12009488818083502,
|
|
"learning_rate": 2.819853459208116e-06,
|
|
"loss": 1.1844,
|
|
"mean_token_accuracy": 0.7139721989631653,
|
|
"num_tokens": 94532746.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 0.9074757414240809,
|
|
"grad_norm": 0.11607076199918324,
|
|
"learning_rate": 2.8163308440186e-06,
|
|
"loss": 1.2627,
|
|
"mean_token_accuracy": 0.6993822157382965,
|
|
"num_tokens": 95458447.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.9088424217575509,
|
|
"grad_norm": 0.12399445021188303,
|
|
"learning_rate": 2.8128082288290832e-06,
|
|
"loss": 1.2038,
|
|
"mean_token_accuracy": 0.7102779746055603,
|
|
"num_tokens": 96361135.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 1.23671875,
|
|
"epoch": 0.9102091020910209,
|
|
"grad_norm": 0.13094581132289573,
|
|
"learning_rate": 2.8092856136395664e-06,
|
|
"loss": 1.2374,
|
|
"mean_token_accuracy": 0.7047690629959107,
|
|
"num_tokens": 97291091.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 0.9115757824244909,
|
|
"grad_norm": 0.1345120602557189,
|
|
"learning_rate": 2.8057629984500495e-06,
|
|
"loss": 1.2127,
|
|
"mean_token_accuracy": 0.7067639112472535,
|
|
"num_tokens": 98257081.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.9129424627579609,
|
|
"grad_norm": 0.13157239667209203,
|
|
"learning_rate": 2.8022403832605326e-06,
|
|
"loss": 1.1873,
|
|
"mean_token_accuracy": 0.7137351334095001,
|
|
"num_tokens": 99191461.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 0.914309143091431,
|
|
"grad_norm": 0.11119904493629737,
|
|
"learning_rate": 2.7987177680710157e-06,
|
|
"loss": 1.2179,
|
|
"mean_token_accuracy": 0.7098675489425659,
|
|
"num_tokens": 100129277.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.915675823424901,
|
|
"grad_norm": 0.14302362216476552,
|
|
"learning_rate": 2.7951951528814997e-06,
|
|
"loss": 1.2398,
|
|
"mean_token_accuracy": 0.7045514285564423,
|
|
"num_tokens": 101073591.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 0.9170425037583709,
|
|
"grad_norm": 0.10970036439054749,
|
|
"learning_rate": 2.791672537691983e-06,
|
|
"loss": 1.2293,
|
|
"mean_token_accuracy": 0.7031483590602875,
|
|
"num_tokens": 102045291.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 0.9184091840918409,
|
|
"grad_norm": 0.1256381015177892,
|
|
"learning_rate": 2.788149922502466e-06,
|
|
"loss": 1.2087,
|
|
"mean_token_accuracy": 0.7112240493297577,
|
|
"num_tokens": 102947908.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.9197758644253109,
|
|
"grad_norm": 0.13127569898639338,
|
|
"learning_rate": 2.784627307312949e-06,
|
|
"loss": 1.1897,
|
|
"mean_token_accuracy": 0.7150723397731781,
|
|
"num_tokens": 103886938.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 0.9211425447587809,
|
|
"grad_norm": 0.11905529297297285,
|
|
"learning_rate": 2.7811046921234326e-06,
|
|
"loss": 1.1817,
|
|
"mean_token_accuracy": 0.7135479509830475,
|
|
"num_tokens": 104809648.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 0.922509225092251,
|
|
"grad_norm": 0.1182055876116771,
|
|
"learning_rate": 2.777582076933916e-06,
|
|
"loss": 1.2373,
|
|
"mean_token_accuracy": 0.7058520674705505,
|
|
"num_tokens": 105713318.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.923875905425721,
|
|
"grad_norm": 0.12331564012484585,
|
|
"learning_rate": 2.7740594617443993e-06,
|
|
"loss": 1.1934,
|
|
"mean_token_accuracy": 0.7114256024360657,
|
|
"num_tokens": 106658105.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 0.9252425857591909,
|
|
"grad_norm": 0.12132340620161727,
|
|
"learning_rate": 2.770536846554883e-06,
|
|
"loss": 1.1956,
|
|
"mean_token_accuracy": 0.7137125313282013,
|
|
"num_tokens": 107622892.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 0.9266092660926609,
|
|
"grad_norm": 0.11795470432671681,
|
|
"learning_rate": 2.767014231365366e-06,
|
|
"loss": 1.2468,
|
|
"mean_token_accuracy": 0.7045657813549042,
|
|
"num_tokens": 108534824.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 0.9279759464261309,
|
|
"grad_norm": 0.108219479350522,
|
|
"learning_rate": 2.763491616175849e-06,
|
|
"loss": 1.2267,
|
|
"mean_token_accuracy": 0.707587194442749,
|
|
"num_tokens": 109501934.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.9293426267596009,
|
|
"grad_norm": 0.12031058861363844,
|
|
"learning_rate": 2.759969000986332e-06,
|
|
"loss": 1.1997,
|
|
"mean_token_accuracy": 0.7110873699188233,
|
|
"num_tokens": 110441401.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 1.158203125,
|
|
"epoch": 0.930709307093071,
|
|
"grad_norm": 0.11624553904565592,
|
|
"learning_rate": 2.756446385796816e-06,
|
|
"loss": 1.1679,
|
|
"mean_token_accuracy": 0.7170976519584655,
|
|
"num_tokens": 111349029.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.932075987426541,
|
|
"grad_norm": 0.11520556182655277,
|
|
"learning_rate": 2.7529237706072993e-06,
|
|
"loss": 1.1866,
|
|
"mean_token_accuracy": 0.7129973828792572,
|
|
"num_tokens": 112282615.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 0.933442667760011,
|
|
"grad_norm": 0.11751437413209412,
|
|
"learning_rate": 2.7494011554177824e-06,
|
|
"loss": 1.1841,
|
|
"mean_token_accuracy": 0.7153954088687897,
|
|
"num_tokens": 113198887.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 1.2375,
|
|
"epoch": 0.9348093480934809,
|
|
"grad_norm": 0.13261082884063324,
|
|
"learning_rate": 2.7458785402282656e-06,
|
|
"loss": 1.2349,
|
|
"mean_token_accuracy": 0.7064491748809815,
|
|
"num_tokens": 114109013.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 0.9361760284269509,
|
|
"grad_norm": 0.11461633178186699,
|
|
"learning_rate": 2.7423559250387487e-06,
|
|
"loss": 1.2392,
|
|
"mean_token_accuracy": 0.7054702937602997,
|
|
"num_tokens": 115028034.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.9375427087604209,
|
|
"grad_norm": 0.11762251195967308,
|
|
"learning_rate": 2.7388333098492326e-06,
|
|
"loss": 1.1801,
|
|
"mean_token_accuracy": 0.7152316689491272,
|
|
"num_tokens": 115892473.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 0.938909389093891,
|
|
"grad_norm": 0.11038019825401116,
|
|
"learning_rate": 2.7353106946597158e-06,
|
|
"loss": 1.2261,
|
|
"mean_token_accuracy": 0.7051122963428498,
|
|
"num_tokens": 116864287.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 0.940276069427361,
|
|
"grad_norm": 0.11672547911066007,
|
|
"learning_rate": 2.731788079470199e-06,
|
|
"loss": 1.2431,
|
|
"mean_token_accuracy": 0.701931631565094,
|
|
"num_tokens": 117745080.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 1.22421875,
|
|
"epoch": 0.941642749760831,
|
|
"grad_norm": 0.12580098115777014,
|
|
"learning_rate": 2.728265464280682e-06,
|
|
"loss": 1.2321,
|
|
"mean_token_accuracy": 0.7063672840595245,
|
|
"num_tokens": 118647336.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 0.9430094300943009,
|
|
"grad_norm": 0.1188495382274125,
|
|
"learning_rate": 2.7247428490911656e-06,
|
|
"loss": 1.2099,
|
|
"mean_token_accuracy": 0.7104326844215393,
|
|
"num_tokens": 119534463.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.9443761104277709,
|
|
"grad_norm": 0.12449799322176075,
|
|
"learning_rate": 2.7212202339016487e-06,
|
|
"loss": 1.2267,
|
|
"mean_token_accuracy": 0.7074391543865204,
|
|
"num_tokens": 120466812.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 0.9457427907612409,
|
|
"grad_norm": 0.11095258026829394,
|
|
"learning_rate": 2.7176976187121322e-06,
|
|
"loss": 1.1947,
|
|
"mean_token_accuracy": 0.7117006599903106,
|
|
"num_tokens": 121452165.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 0.947109471094711,
|
|
"grad_norm": 0.14527691471330995,
|
|
"learning_rate": 2.7141750035226154e-06,
|
|
"loss": 1.2295,
|
|
"mean_token_accuracy": 0.7056489169597626,
|
|
"num_tokens": 122414115.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.948476151428181,
|
|
"grad_norm": 0.11955010276257833,
|
|
"learning_rate": 2.710652388333099e-06,
|
|
"loss": 1.2138,
|
|
"mean_token_accuracy": 0.7094878554344177,
|
|
"num_tokens": 123359816.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 0.949842831761651,
|
|
"grad_norm": 0.1196008365260796,
|
|
"learning_rate": 2.707129773143582e-06,
|
|
"loss": 1.2416,
|
|
"mean_token_accuracy": 0.7020051300525665,
|
|
"num_tokens": 124321188.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 0.951209512095121,
|
|
"grad_norm": 0.12138210587604782,
|
|
"learning_rate": 2.703607157954065e-06,
|
|
"loss": 1.1861,
|
|
"mean_token_accuracy": 0.713533341884613,
|
|
"num_tokens": 125261226.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.9525761924285909,
|
|
"grad_norm": 0.1263323463896496,
|
|
"learning_rate": 2.7000845427645483e-06,
|
|
"loss": 1.232,
|
|
"mean_token_accuracy": 0.7058530747890472,
|
|
"num_tokens": 126144563.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 0.9539428727620609,
|
|
"grad_norm": 0.12033418394201179,
|
|
"learning_rate": 2.6965619275750322e-06,
|
|
"loss": 1.1665,
|
|
"mean_token_accuracy": 0.7173733115196228,
|
|
"num_tokens": 127048888.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 1.27734375,
|
|
"epoch": 0.955309553095531,
|
|
"grad_norm": 0.13454184863540591,
|
|
"learning_rate": 2.6930393123855154e-06,
|
|
"loss": 1.2921,
|
|
"mean_token_accuracy": 0.6958851516246796,
|
|
"num_tokens": 127989620.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 1.2546875,
|
|
"epoch": 0.956676233429001,
|
|
"grad_norm": 0.12437579897351421,
|
|
"learning_rate": 2.6895166971959985e-06,
|
|
"loss": 1.2694,
|
|
"mean_token_accuracy": 0.6990921854972839,
|
|
"num_tokens": 128888079.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 0.958042913762471,
|
|
"grad_norm": 0.16222382726173368,
|
|
"learning_rate": 2.6859940820064816e-06,
|
|
"loss": 1.1768,
|
|
"mean_token_accuracy": 0.7161491513252258,
|
|
"num_tokens": 129792691.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 0.959409594095941,
|
|
"grad_norm": 0.14062088497188197,
|
|
"learning_rate": 2.6824714668169647e-06,
|
|
"loss": 1.2123,
|
|
"mean_token_accuracy": 0.7096220493316651,
|
|
"num_tokens": 130700169.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 0.9607762744294109,
|
|
"grad_norm": 0.1149781005460307,
|
|
"learning_rate": 2.6789488516274487e-06,
|
|
"loss": 1.2157,
|
|
"mean_token_accuracy": 0.706826651096344,
|
|
"num_tokens": 131692013.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 0.9621429547628809,
|
|
"grad_norm": 0.12338776014247105,
|
|
"learning_rate": 2.675426236437932e-06,
|
|
"loss": 1.2054,
|
|
"mean_token_accuracy": 0.7109057426452636,
|
|
"num_tokens": 132656161.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 0.963509635096351,
|
|
"grad_norm": 0.11207906058932651,
|
|
"learning_rate": 2.671903621248415e-06,
|
|
"loss": 1.1885,
|
|
"mean_token_accuracy": 0.7103139996528626,
|
|
"num_tokens": 133550225.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 0.964876315429821,
|
|
"grad_norm": 0.12024166573483841,
|
|
"learning_rate": 2.668381006058898e-06,
|
|
"loss": 1.2252,
|
|
"mean_token_accuracy": 0.7049888968467712,
|
|
"num_tokens": 134452831.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.966242995763291,
|
|
"grad_norm": 0.12230734134582784,
|
|
"learning_rate": 2.6648583908693816e-06,
|
|
"loss": 1.2239,
|
|
"mean_token_accuracy": 0.7070957958698273,
|
|
"num_tokens": 135370497.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 1.23828125,
|
|
"epoch": 0.967609676096761,
|
|
"grad_norm": 0.1873668832529693,
|
|
"learning_rate": 2.6613357756798648e-06,
|
|
"loss": 1.2528,
|
|
"mean_token_accuracy": 0.7010502636432647,
|
|
"num_tokens": 136290488.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.968976356430231,
|
|
"grad_norm": 0.11924175657591951,
|
|
"learning_rate": 2.6578131604903483e-06,
|
|
"loss": 1.2021,
|
|
"mean_token_accuracy": 0.7100654065608978,
|
|
"num_tokens": 137177367.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 0.9703430367637009,
|
|
"grad_norm": 0.13264300360013678,
|
|
"learning_rate": 2.6542905453008314e-06,
|
|
"loss": 1.2009,
|
|
"mean_token_accuracy": 0.7124431788921356,
|
|
"num_tokens": 138101453.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 0.971709717097171,
|
|
"grad_norm": 0.11412548283682244,
|
|
"learning_rate": 2.650767930111315e-06,
|
|
"loss": 1.2223,
|
|
"mean_token_accuracy": 0.7090808749198914,
|
|
"num_tokens": 139032711.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 0.973076397430641,
|
|
"grad_norm": 0.12583790941685022,
|
|
"learning_rate": 2.647245314921798e-06,
|
|
"loss": 1.2242,
|
|
"mean_token_accuracy": 0.7077539622783661,
|
|
"num_tokens": 139991073.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 0.974443077764111,
|
|
"grad_norm": 0.10822498613470957,
|
|
"learning_rate": 2.6437226997322812e-06,
|
|
"loss": 1.2052,
|
|
"mean_token_accuracy": 0.7103246033191681,
|
|
"num_tokens": 140960441.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 0.975809758097581,
|
|
"grad_norm": 0.11994748707187208,
|
|
"learning_rate": 2.640200084542765e-06,
|
|
"loss": 1.1933,
|
|
"mean_token_accuracy": 0.7102828741073608,
|
|
"num_tokens": 141897029.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 0.977176438431051,
|
|
"grad_norm": 0.11632102337267018,
|
|
"learning_rate": 2.6366774693532483e-06,
|
|
"loss": 1.2046,
|
|
"mean_token_accuracy": 0.7108843982219696,
|
|
"num_tokens": 142750817.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 0.9785431187645209,
|
|
"grad_norm": 0.13207700704079778,
|
|
"learning_rate": 2.6331548541637314e-06,
|
|
"loss": 1.2442,
|
|
"mean_token_accuracy": 0.7046360909938812,
|
|
"num_tokens": 143670880.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 0.979909799097991,
|
|
"grad_norm": 0.11009203568641548,
|
|
"learning_rate": 2.6296322389742146e-06,
|
|
"loss": 1.1842,
|
|
"mean_token_accuracy": 0.7138502657413482,
|
|
"num_tokens": 144603655.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 0.981276479431461,
|
|
"grad_norm": 0.11956392747740925,
|
|
"learning_rate": 2.6261096237846977e-06,
|
|
"loss": 1.1735,
|
|
"mean_token_accuracy": 0.7166625320911407,
|
|
"num_tokens": 145514400.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 0.982643159764931,
|
|
"grad_norm": 0.1354796344153864,
|
|
"learning_rate": 2.622587008595181e-06,
|
|
"loss": 1.1894,
|
|
"mean_token_accuracy": 0.7142235159873962,
|
|
"num_tokens": 146414141.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 0.984009840098401,
|
|
"grad_norm": 0.13642563991962947,
|
|
"learning_rate": 2.6190643934056648e-06,
|
|
"loss": 1.229,
|
|
"mean_token_accuracy": 0.7047328293323517,
|
|
"num_tokens": 147347453.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 0.985376520431871,
|
|
"grad_norm": 0.13446073197943947,
|
|
"learning_rate": 2.615541778216148e-06,
|
|
"loss": 1.2561,
|
|
"mean_token_accuracy": 0.7013183295726776,
|
|
"num_tokens": 148247017.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 0.9867432007653409,
|
|
"grad_norm": 0.11864773262353144,
|
|
"learning_rate": 2.612019163026631e-06,
|
|
"loss": 1.2039,
|
|
"mean_token_accuracy": 0.7115210771560669,
|
|
"num_tokens": 149151212.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 1.26953125,
|
|
"epoch": 0.988109881098811,
|
|
"grad_norm": 0.12043252390209125,
|
|
"learning_rate": 2.6084965478371146e-06,
|
|
"loss": 1.2822,
|
|
"mean_token_accuracy": 0.6980770766735077,
|
|
"num_tokens": 150076593.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 0.989476561432281,
|
|
"grad_norm": 0.14440881489038315,
|
|
"learning_rate": 2.6049739326475977e-06,
|
|
"loss": 1.1928,
|
|
"mean_token_accuracy": 0.7126754462718964,
|
|
"num_tokens": 151000729.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 0.990843241765751,
|
|
"grad_norm": 0.14068732454239577,
|
|
"learning_rate": 2.6014513174580812e-06,
|
|
"loss": 1.2214,
|
|
"mean_token_accuracy": 0.708335953950882,
|
|
"num_tokens": 151894468.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 0.992209922099221,
|
|
"grad_norm": 0.11322690422816707,
|
|
"learning_rate": 2.5979287022685644e-06,
|
|
"loss": 1.1924,
|
|
"mean_token_accuracy": 0.7138138890266419,
|
|
"num_tokens": 152787250.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 0.993576602432691,
|
|
"grad_norm": 0.13911191899858039,
|
|
"learning_rate": 2.594406087079048e-06,
|
|
"loss": 1.1718,
|
|
"mean_token_accuracy": 0.7152155220508576,
|
|
"num_tokens": 153694052.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 0.994943282766161,
|
|
"grad_norm": 0.11642929808055694,
|
|
"learning_rate": 2.590883471889531e-06,
|
|
"loss": 1.2503,
|
|
"mean_token_accuracy": 0.7018868923187256,
|
|
"num_tokens": 154602625.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 0.996309963099631,
|
|
"grad_norm": 0.16287285118445177,
|
|
"learning_rate": 2.587360856700014e-06,
|
|
"loss": 1.1975,
|
|
"mean_token_accuracy": 0.7111340343952179,
|
|
"num_tokens": 155507476.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 0.997676643433101,
|
|
"grad_norm": 0.13156889418708104,
|
|
"learning_rate": 2.5838382415104973e-06,
|
|
"loss": 1.2089,
|
|
"mean_token_accuracy": 0.7086274147033691,
|
|
"num_tokens": 156430565.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 1.240625,
|
|
"epoch": 0.999043323766571,
|
|
"grad_norm": 0.11134626159887877,
|
|
"learning_rate": 2.5803156263209813e-06,
|
|
"loss": 1.2553,
|
|
"mean_token_accuracy": 0.6986011385917663,
|
|
"num_tokens": 157327232.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 1.24375,
|
|
"epoch": 1.000410004100041,
|
|
"grad_norm": 0.14072741296398159,
|
|
"learning_rate": 2.5767930111314644e-06,
|
|
"loss": 1.2452,
|
|
"mean_token_accuracy": 0.7032181978225708,
|
|
"num_tokens": 158288028.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.001776684433511,
|
|
"grad_norm": 0.12306609659957422,
|
|
"learning_rate": 2.5732703959419475e-06,
|
|
"loss": 1.1885,
|
|
"mean_token_accuracy": 0.712241530418396,
|
|
"num_tokens": 159177595.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.003143364766981,
|
|
"grad_norm": 0.1134901081899675,
|
|
"learning_rate": 2.5697477807524306e-06,
|
|
"loss": 1.2008,
|
|
"mean_token_accuracy": 0.7087075173854828,
|
|
"num_tokens": 160162002.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.004510045100451,
|
|
"grad_norm": 0.1224544485499782,
|
|
"learning_rate": 2.5662251655629138e-06,
|
|
"loss": 1.1556,
|
|
"mean_token_accuracy": 0.7189548432826995,
|
|
"num_tokens": 161083842.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 1.005876725433921,
|
|
"grad_norm": 0.1713475946056354,
|
|
"learning_rate": 2.5627025503733977e-06,
|
|
"loss": 1.2452,
|
|
"mean_token_accuracy": 0.701144540309906,
|
|
"num_tokens": 162012486.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 1.007243405767391,
|
|
"grad_norm": 0.11456443455699516,
|
|
"learning_rate": 2.559179935183881e-06,
|
|
"loss": 1.2239,
|
|
"mean_token_accuracy": 0.7075781881809234,
|
|
"num_tokens": 162916191.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 1.0086100861008611,
|
|
"grad_norm": 0.1315509621324793,
|
|
"learning_rate": 2.555657319994364e-06,
|
|
"loss": 1.2423,
|
|
"mean_token_accuracy": 0.705412095785141,
|
|
"num_tokens": 163836150.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 1.009976766434331,
|
|
"grad_norm": 0.1375416507241031,
|
|
"learning_rate": 2.552134704804847e-06,
|
|
"loss": 1.2113,
|
|
"mean_token_accuracy": 0.7073660254478454,
|
|
"num_tokens": 164798025.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 1.011343446767801,
|
|
"grad_norm": 0.13871408608892885,
|
|
"learning_rate": 2.5486120896153306e-06,
|
|
"loss": 1.2673,
|
|
"mean_token_accuracy": 0.6966290950775147,
|
|
"num_tokens": 165699866.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.012710127101271,
|
|
"grad_norm": 0.1392256019594753,
|
|
"learning_rate": 2.5450894744258138e-06,
|
|
"loss": 1.2042,
|
|
"mean_token_accuracy": 0.7121475994586944,
|
|
"num_tokens": 166623537.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.014076807434741,
|
|
"grad_norm": 0.20131562580084647,
|
|
"learning_rate": 2.5415668592362973e-06,
|
|
"loss": 1.1613,
|
|
"mean_token_accuracy": 0.7205895781517029,
|
|
"num_tokens": 167502701.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 1.015443487768211,
|
|
"grad_norm": 0.11912589522425475,
|
|
"learning_rate": 2.5380442440467804e-06,
|
|
"loss": 1.1973,
|
|
"mean_token_accuracy": 0.7143389105796814,
|
|
"num_tokens": 168443102.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.016810168101681,
|
|
"grad_norm": 0.12547634181588338,
|
|
"learning_rate": 2.534521628857264e-06,
|
|
"loss": 1.2294,
|
|
"mean_token_accuracy": 0.7039758265018463,
|
|
"num_tokens": 169357386.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 1.018176848435151,
|
|
"grad_norm": 0.10762065321192509,
|
|
"learning_rate": 2.530999013667747e-06,
|
|
"loss": 1.209,
|
|
"mean_token_accuracy": 0.7097684681415558,
|
|
"num_tokens": 170257004.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 1.29609375,
|
|
"epoch": 1.019543528768621,
|
|
"grad_norm": 0.12261361026890207,
|
|
"learning_rate": 2.5274763984782302e-06,
|
|
"loss": 1.2915,
|
|
"mean_token_accuracy": 0.6962465524673462,
|
|
"num_tokens": 171177938.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.020910209102091,
|
|
"grad_norm": 0.1843046178820221,
|
|
"learning_rate": 2.5239537832887138e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.7169839560985565,
|
|
"num_tokens": 172138150.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 1.022276889435561,
|
|
"grad_norm": 0.15560460651994656,
|
|
"learning_rate": 2.5204311680991973e-06,
|
|
"loss": 1.204,
|
|
"mean_token_accuracy": 0.7082561016082763,
|
|
"num_tokens": 173066623.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.023643569769031,
|
|
"grad_norm": 0.10609553896704066,
|
|
"learning_rate": 2.5169085529096804e-06,
|
|
"loss": 1.164,
|
|
"mean_token_accuracy": 0.7180382251739502,
|
|
"num_tokens": 173922054.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.0250102501025011,
|
|
"grad_norm": 0.12487133298050056,
|
|
"learning_rate": 2.5133859377201636e-06,
|
|
"loss": 1.2197,
|
|
"mean_token_accuracy": 0.7042903661727905,
|
|
"num_tokens": 174837815.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 1.0263769304359711,
|
|
"grad_norm": 0.16894475670108466,
|
|
"learning_rate": 2.5098633225306467e-06,
|
|
"loss": 1.2587,
|
|
"mean_token_accuracy": 0.7009473621845246,
|
|
"num_tokens": 175768726.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.027743610769441,
|
|
"grad_norm": 0.1276932762873247,
|
|
"learning_rate": 2.50634070734113e-06,
|
|
"loss": 1.182,
|
|
"mean_token_accuracy": 0.7151334762573243,
|
|
"num_tokens": 176702312.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 1.029110291102911,
|
|
"grad_norm": 0.12111887169507803,
|
|
"learning_rate": 2.502818092151614e-06,
|
|
"loss": 1.1984,
|
|
"mean_token_accuracy": 0.7120105862617493,
|
|
"num_tokens": 177604059.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.030476971436381,
|
|
"grad_norm": 0.12681227482735105,
|
|
"learning_rate": 2.499295476962097e-06,
|
|
"loss": 1.1944,
|
|
"mean_token_accuracy": 0.7113577008247376,
|
|
"num_tokens": 178530454.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 1.24140625,
|
|
"epoch": 1.031843651769851,
|
|
"grad_norm": 0.13133821696242312,
|
|
"learning_rate": 2.49577286177258e-06,
|
|
"loss": 1.2472,
|
|
"mean_token_accuracy": 0.7060240387916565,
|
|
"num_tokens": 179483434.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 1.033210332103321,
|
|
"grad_norm": 0.13079747557014498,
|
|
"learning_rate": 2.492250246583063e-06,
|
|
"loss": 1.2121,
|
|
"mean_token_accuracy": 0.7091143250465393,
|
|
"num_tokens": 180391558.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 1.034577012436791,
|
|
"grad_norm": 0.11281717527012466,
|
|
"learning_rate": 2.4887276313935467e-06,
|
|
"loss": 1.1996,
|
|
"mean_token_accuracy": 0.7108489573001862,
|
|
"num_tokens": 181295380.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.035943692770261,
|
|
"grad_norm": 0.14437610408188967,
|
|
"learning_rate": 2.48520501620403e-06,
|
|
"loss": 1.1865,
|
|
"mean_token_accuracy": 0.7111780166625976,
|
|
"num_tokens": 182239138.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.037310373103731,
|
|
"grad_norm": 0.1245412665964164,
|
|
"learning_rate": 2.4816824010145134e-06,
|
|
"loss": 1.1952,
|
|
"mean_token_accuracy": 0.7099207282066345,
|
|
"num_tokens": 183141978.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 1.038677053437201,
|
|
"grad_norm": 0.12482636671958454,
|
|
"learning_rate": 2.478159785824997e-06,
|
|
"loss": 1.2153,
|
|
"mean_token_accuracy": 0.7087715625762939,
|
|
"num_tokens": 184121466.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 1.040043733770671,
|
|
"grad_norm": 0.1273143123340155,
|
|
"learning_rate": 2.47463717063548e-06,
|
|
"loss": 1.2255,
|
|
"mean_token_accuracy": 0.7076945781707764,
|
|
"num_tokens": 185051374.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 1.25234375,
|
|
"epoch": 1.0414104141041411,
|
|
"grad_norm": 0.2729094197851763,
|
|
"learning_rate": 2.4711145554459636e-06,
|
|
"loss": 1.2545,
|
|
"mean_token_accuracy": 0.7016341030597687,
|
|
"num_tokens": 186003742.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 1.0427770944376111,
|
|
"grad_norm": 0.11921855963625891,
|
|
"learning_rate": 2.4675919402564467e-06,
|
|
"loss": 1.2384,
|
|
"mean_token_accuracy": 0.7035252571105957,
|
|
"num_tokens": 186925258.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 1.0441437747710811,
|
|
"grad_norm": 0.12788578018470742,
|
|
"learning_rate": 2.46406932506693e-06,
|
|
"loss": 1.2223,
|
|
"mean_token_accuracy": 0.7075823724269867,
|
|
"num_tokens": 187879783.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 1.045510455104551,
|
|
"grad_norm": 0.12016525365109938,
|
|
"learning_rate": 2.4605467098774134e-06,
|
|
"loss": 1.2027,
|
|
"mean_token_accuracy": 0.7113193869590759,
|
|
"num_tokens": 188843845.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.046877135438021,
|
|
"grad_norm": 0.11660234201425848,
|
|
"learning_rate": 2.4570240946878965e-06,
|
|
"loss": 1.1927,
|
|
"mean_token_accuracy": 0.7119068324565887,
|
|
"num_tokens": 189758326.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 1.048243815771491,
|
|
"grad_norm": 0.1280280599825291,
|
|
"learning_rate": 2.4535014794983796e-06,
|
|
"loss": 1.256,
|
|
"mean_token_accuracy": 0.7020646035671234,
|
|
"num_tokens": 190673197.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.049610496104961,
|
|
"grad_norm": 0.12597947446102997,
|
|
"learning_rate": 2.449978864308863e-06,
|
|
"loss": 1.1796,
|
|
"mean_token_accuracy": 0.7160924792289733,
|
|
"num_tokens": 191557879.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.050977176438431,
|
|
"grad_norm": 0.12490146895603235,
|
|
"learning_rate": 2.4464562491193463e-06,
|
|
"loss": 1.1938,
|
|
"mean_token_accuracy": 0.7118254601955414,
|
|
"num_tokens": 192452839.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 1.2203125,
|
|
"epoch": 1.052343856771901,
|
|
"grad_norm": 0.12883306791205515,
|
|
"learning_rate": 2.4429336339298294e-06,
|
|
"loss": 1.2383,
|
|
"mean_token_accuracy": 0.7050466954708099,
|
|
"num_tokens": 193398464.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.053710537105371,
|
|
"grad_norm": 0.115798687328715,
|
|
"learning_rate": 2.439411018740313e-06,
|
|
"loss": 1.1359,
|
|
"mean_token_accuracy": 0.7199380576610566,
|
|
"num_tokens": 194283184.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.055077217438841,
|
|
"grad_norm": 0.11783894721197312,
|
|
"learning_rate": 2.435888403550796e-06,
|
|
"loss": 1.1605,
|
|
"mean_token_accuracy": 0.7169598639011383,
|
|
"num_tokens": 195180241.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.056443897772311,
|
|
"grad_norm": 0.15512689701515295,
|
|
"learning_rate": 2.4323657883612797e-06,
|
|
"loss": 1.2185,
|
|
"mean_token_accuracy": 0.7083754956722259,
|
|
"num_tokens": 196112533.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 1.0578105781057812,
|
|
"grad_norm": 0.1360668782929442,
|
|
"learning_rate": 2.4288431731717628e-06,
|
|
"loss": 1.216,
|
|
"mean_token_accuracy": 0.7091945588588715,
|
|
"num_tokens": 197002409.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 1.0591772584392511,
|
|
"grad_norm": 0.1458658100478793,
|
|
"learning_rate": 2.4253205579822463e-06,
|
|
"loss": 1.2062,
|
|
"mean_token_accuracy": 0.7082434952259063,
|
|
"num_tokens": 197898982.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.0605439387727211,
|
|
"grad_norm": 0.1428200374015133,
|
|
"learning_rate": 2.4217979427927294e-06,
|
|
"loss": 1.1665,
|
|
"mean_token_accuracy": 0.7185484230518341,
|
|
"num_tokens": 198790555.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.0619106191061911,
|
|
"grad_norm": 0.13986420994368554,
|
|
"learning_rate": 2.418275327603213e-06,
|
|
"loss": 1.1592,
|
|
"mean_token_accuracy": 0.7185302138328552,
|
|
"num_tokens": 199693946.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.063277299439661,
|
|
"grad_norm": 0.14886403015377064,
|
|
"learning_rate": 2.414752712413696e-06,
|
|
"loss": 1.1821,
|
|
"mean_token_accuracy": 0.7135333955287934,
|
|
"num_tokens": 200614341.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 1.145703125,
|
|
"epoch": 1.064643979773131,
|
|
"grad_norm": 0.11869818583919854,
|
|
"learning_rate": 2.4112300972241797e-06,
|
|
"loss": 1.1481,
|
|
"mean_token_accuracy": 0.7224089741706848,
|
|
"num_tokens": 201522326.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.066010660106601,
|
|
"grad_norm": 0.13918536212481206,
|
|
"learning_rate": 2.4077074820346628e-06,
|
|
"loss": 1.2141,
|
|
"mean_token_accuracy": 0.7091760993003845,
|
|
"num_tokens": 202451122.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 1.23125,
|
|
"epoch": 1.067377340440071,
|
|
"grad_norm": 0.12897950907071545,
|
|
"learning_rate": 2.404184866845146e-06,
|
|
"loss": 1.2351,
|
|
"mean_token_accuracy": 0.7024719178676605,
|
|
"num_tokens": 203398317.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 1.068744020773541,
|
|
"grad_norm": 0.22367861988830376,
|
|
"learning_rate": 2.4006622516556295e-06,
|
|
"loss": 1.2184,
|
|
"mean_token_accuracy": 0.7077607154846192,
|
|
"num_tokens": 204324245.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 1.070110701107011,
|
|
"grad_norm": 0.11980990396044089,
|
|
"learning_rate": 2.3971396364661126e-06,
|
|
"loss": 1.2457,
|
|
"mean_token_accuracy": 0.7041300535202026,
|
|
"num_tokens": 205235085.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 1.071477381440481,
|
|
"grad_norm": 0.12787065222286223,
|
|
"learning_rate": 2.393617021276596e-06,
|
|
"loss": 1.2296,
|
|
"mean_token_accuracy": 0.7068292319774627,
|
|
"num_tokens": 206145262.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 1.072844061773951,
|
|
"grad_norm": 0.11253823877245761,
|
|
"learning_rate": 2.3900944060870793e-06,
|
|
"loss": 1.2261,
|
|
"mean_token_accuracy": 0.7091174483299255,
|
|
"num_tokens": 207036229.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 1.0742107421074212,
|
|
"grad_norm": 0.11702434204528873,
|
|
"learning_rate": 2.3865717908975624e-06,
|
|
"loss": 1.2013,
|
|
"mean_token_accuracy": 0.7097544252872467,
|
|
"num_tokens": 207970868.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 1.0755774224408912,
|
|
"grad_norm": 0.12812957595785932,
|
|
"learning_rate": 2.383049175708046e-06,
|
|
"loss": 1.2234,
|
|
"mean_token_accuracy": 0.7066633462905884,
|
|
"num_tokens": 208888583.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.0769441027743611,
|
|
"grad_norm": 0.13774957504402194,
|
|
"learning_rate": 2.379526560518529e-06,
|
|
"loss": 1.1839,
|
|
"mean_token_accuracy": 0.7131855547428131,
|
|
"num_tokens": 209794977.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 1.0783107831078311,
|
|
"grad_norm": 0.1223677897785382,
|
|
"learning_rate": 2.376003945329012e-06,
|
|
"loss": 1.2348,
|
|
"mean_token_accuracy": 0.703465461730957,
|
|
"num_tokens": 210734920.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.079677463441301,
|
|
"grad_norm": 0.12667568149794184,
|
|
"learning_rate": 2.3724813301394957e-06,
|
|
"loss": 1.1524,
|
|
"mean_token_accuracy": 0.7196262419223786,
|
|
"num_tokens": 211626863.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.081044143774771,
|
|
"grad_norm": 0.10799812037432739,
|
|
"learning_rate": 2.368958714949979e-06,
|
|
"loss": 1.1559,
|
|
"mean_token_accuracy": 0.7196785926818847,
|
|
"num_tokens": 212511243.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.082410824108241,
|
|
"grad_norm": 0.13843078294782488,
|
|
"learning_rate": 2.3654360997604624e-06,
|
|
"loss": 1.1692,
|
|
"mean_token_accuracy": 0.7172195613384247,
|
|
"num_tokens": 213413294.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 1.083777504441711,
|
|
"grad_norm": 0.11625948231647545,
|
|
"learning_rate": 2.3619134845709455e-06,
|
|
"loss": 1.2243,
|
|
"mean_token_accuracy": 0.7082398533821106,
|
|
"num_tokens": 214360740.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 1.209375,
|
|
"epoch": 1.085144184775181,
|
|
"grad_norm": 0.12012919917363085,
|
|
"learning_rate": 2.358390869381429e-06,
|
|
"loss": 1.2073,
|
|
"mean_token_accuracy": 0.7114357829093934,
|
|
"num_tokens": 215276360.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.086510865108651,
|
|
"grad_norm": 0.11393441283226119,
|
|
"learning_rate": 2.354868254191912e-06,
|
|
"loss": 1.2327,
|
|
"mean_token_accuracy": 0.7054458260536194,
|
|
"num_tokens": 216217823.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 1.087877545442121,
|
|
"grad_norm": 0.14126409049621477,
|
|
"learning_rate": 2.3513456390023957e-06,
|
|
"loss": 1.1968,
|
|
"mean_token_accuracy": 0.7126922488212586,
|
|
"num_tokens": 217151289.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 1.089244225775591,
|
|
"grad_norm": 0.1298146245035236,
|
|
"learning_rate": 2.347823023812879e-06,
|
|
"loss": 1.2399,
|
|
"mean_token_accuracy": 0.7037797689437866,
|
|
"num_tokens": 218037712.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 1.090610906109061,
|
|
"grad_norm": 0.11939328407252794,
|
|
"learning_rate": 2.3443004086233624e-06,
|
|
"loss": 1.2358,
|
|
"mean_token_accuracy": 0.7055944681167603,
|
|
"num_tokens": 218942500.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 1.0919775864425312,
|
|
"grad_norm": 0.13332663467853656,
|
|
"learning_rate": 2.3407777934338455e-06,
|
|
"loss": 1.2139,
|
|
"mean_token_accuracy": 0.7124211668968201,
|
|
"num_tokens": 219874280.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.0933442667760012,
|
|
"grad_norm": 0.12672968997670897,
|
|
"learning_rate": 2.3372551782443286e-06,
|
|
"loss": 1.186,
|
|
"mean_token_accuracy": 0.7138333559036255,
|
|
"num_tokens": 220786434.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 1.22265625,
|
|
"epoch": 1.0947109471094711,
|
|
"grad_norm": 0.12515307755636682,
|
|
"learning_rate": 2.333732563054812e-06,
|
|
"loss": 1.2308,
|
|
"mean_token_accuracy": 0.7061972916126251,
|
|
"num_tokens": 221731563.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 1.0960776274429411,
|
|
"grad_norm": 0.1445819222730412,
|
|
"learning_rate": 2.3302099478652953e-06,
|
|
"loss": 1.2353,
|
|
"mean_token_accuracy": 0.7066518187522888,
|
|
"num_tokens": 222661240.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.097444307776411,
|
|
"grad_norm": 0.12197280250995095,
|
|
"learning_rate": 2.3266873326757784e-06,
|
|
"loss": 1.207,
|
|
"mean_token_accuracy": 0.7109593689441681,
|
|
"num_tokens": 223545127.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 1.27890625,
|
|
"epoch": 1.098810988109881,
|
|
"grad_norm": 0.1239023864607689,
|
|
"learning_rate": 2.323164717486262e-06,
|
|
"loss": 1.2851,
|
|
"mean_token_accuracy": 0.692325747013092,
|
|
"num_tokens": 224492507.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 1.152734375,
|
|
"epoch": 1.100177668443351,
|
|
"grad_norm": 0.1189005582218414,
|
|
"learning_rate": 2.319642102296745e-06,
|
|
"loss": 1.1552,
|
|
"mean_token_accuracy": 0.7219026923179627,
|
|
"num_tokens": 225418039.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.101544348776821,
|
|
"grad_norm": 0.14149620113830674,
|
|
"learning_rate": 2.3161194871072287e-06,
|
|
"loss": 1.1957,
|
|
"mean_token_accuracy": 0.7099257946014405,
|
|
"num_tokens": 226328240.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.102911029110291,
|
|
"grad_norm": 0.11291708621997201,
|
|
"learning_rate": 2.3125968719177118e-06,
|
|
"loss": 1.1827,
|
|
"mean_token_accuracy": 0.716242915391922,
|
|
"num_tokens": 227227962.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 1.25,
|
|
"epoch": 1.104277709443761,
|
|
"grad_norm": 0.14870258481677728,
|
|
"learning_rate": 2.3090742567281953e-06,
|
|
"loss": 1.2462,
|
|
"mean_token_accuracy": 0.7028556644916535,
|
|
"num_tokens": 228184716.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.105644389777231,
|
|
"grad_norm": 0.10848982957396221,
|
|
"learning_rate": 2.3055516415386785e-06,
|
|
"loss": 1.1854,
|
|
"mean_token_accuracy": 0.7145794749259948,
|
|
"num_tokens": 229091240.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 1.1070110701107012,
|
|
"grad_norm": 0.15763275209698233,
|
|
"learning_rate": 2.302029026349162e-06,
|
|
"loss": 1.1981,
|
|
"mean_token_accuracy": 0.7131851732730865,
|
|
"num_tokens": 229996026.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 1.1083777504441712,
|
|
"grad_norm": 0.13148719531263145,
|
|
"learning_rate": 2.298506411159645e-06,
|
|
"loss": 1.2226,
|
|
"mean_token_accuracy": 0.7067600190639496,
|
|
"num_tokens": 230939012.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.1097444307776412,
|
|
"grad_norm": 0.15527048064543592,
|
|
"learning_rate": 2.2949837959701287e-06,
|
|
"loss": 1.1971,
|
|
"mean_token_accuracy": 0.7105938374996186,
|
|
"num_tokens": 231929912.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 0.1277617785838064,
|
|
"learning_rate": 2.291461180780612e-06,
|
|
"loss": 1.2191,
|
|
"mean_token_accuracy": 0.7088849425315857,
|
|
"num_tokens": 232883978.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.1124777914445811,
|
|
"grad_norm": 0.15138585252859826,
|
|
"learning_rate": 2.287938565591095e-06,
|
|
"loss": 1.1989,
|
|
"mean_token_accuracy": 0.7106039583683014,
|
|
"num_tokens": 233808725.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.1138444717780511,
|
|
"grad_norm": 0.10578067247465653,
|
|
"learning_rate": 2.2844159504015785e-06,
|
|
"loss": 1.1668,
|
|
"mean_token_accuracy": 0.7173607170581817,
|
|
"num_tokens": 234754012.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 1.115211152111521,
|
|
"grad_norm": 0.15874429397460724,
|
|
"learning_rate": 2.2808933352120616e-06,
|
|
"loss": 1.2135,
|
|
"mean_token_accuracy": 0.7102478265762329,
|
|
"num_tokens": 235622636.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 1.116577832444991,
|
|
"grad_norm": 0.11735135025244955,
|
|
"learning_rate": 2.277370720022545e-06,
|
|
"loss": 1.2144,
|
|
"mean_token_accuracy": 0.7078297913074494,
|
|
"num_tokens": 236535697.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 1.117944512778461,
|
|
"grad_norm": 0.11656219647297071,
|
|
"learning_rate": 2.2738481048330283e-06,
|
|
"loss": 1.2,
|
|
"mean_token_accuracy": 0.7103517591953278,
|
|
"num_tokens": 237475768.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.119311193111931,
|
|
"grad_norm": 0.119393227046756,
|
|
"learning_rate": 2.2703254896435114e-06,
|
|
"loss": 1.1938,
|
|
"mean_token_accuracy": 0.7117473900318145,
|
|
"num_tokens": 238432335.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.120677873445401,
|
|
"grad_norm": 0.12259413500072372,
|
|
"learning_rate": 2.266802874453995e-06,
|
|
"loss": 1.2246,
|
|
"mean_token_accuracy": 0.7063193023204803,
|
|
"num_tokens": 239369741.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.122044553778871,
|
|
"grad_norm": 0.11723547405344298,
|
|
"learning_rate": 2.263280259264478e-06,
|
|
"loss": 1.1932,
|
|
"mean_token_accuracy": 0.7133415699005127,
|
|
"num_tokens": 240272289.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 1.123411234112341,
|
|
"grad_norm": 0.11878901618073714,
|
|
"learning_rate": 2.259757644074961e-06,
|
|
"loss": 1.1587,
|
|
"mean_token_accuracy": 0.7189637303352356,
|
|
"num_tokens": 241173620.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 1.1247779144458112,
|
|
"grad_norm": 0.12794107608304214,
|
|
"learning_rate": 2.2562350288854447e-06,
|
|
"loss": 1.2151,
|
|
"mean_token_accuracy": 0.7093152165412903,
|
|
"num_tokens": 242097678.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.1261445947792812,
|
|
"grad_norm": 0.14730677958012373,
|
|
"learning_rate": 2.252712413695928e-06,
|
|
"loss": 1.1687,
|
|
"mean_token_accuracy": 0.7164739787578582,
|
|
"num_tokens": 242983654.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 1.1275112751127512,
|
|
"grad_norm": 0.11877963951999347,
|
|
"learning_rate": 2.2491897985064114e-06,
|
|
"loss": 1.2154,
|
|
"mean_token_accuracy": 0.7091340541839599,
|
|
"num_tokens": 243925669.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.1288779554462212,
|
|
"grad_norm": 0.13743127535529445,
|
|
"learning_rate": 2.2456671833168945e-06,
|
|
"loss": 1.1979,
|
|
"mean_token_accuracy": 0.711557674407959,
|
|
"num_tokens": 244811053.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.1302446357796911,
|
|
"grad_norm": 0.12366754884321973,
|
|
"learning_rate": 2.242144568127378e-06,
|
|
"loss": 1.1947,
|
|
"mean_token_accuracy": 0.7120516538619995,
|
|
"num_tokens": 245703259.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 1.24921875,
|
|
"epoch": 1.1316113161131611,
|
|
"grad_norm": 0.1058098047621804,
|
|
"learning_rate": 2.238621952937861e-06,
|
|
"loss": 1.2591,
|
|
"mean_token_accuracy": 0.7026559233665466,
|
|
"num_tokens": 246658155.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 1.132977996446631,
|
|
"grad_norm": 0.1309166462005893,
|
|
"learning_rate": 2.2350993377483447e-06,
|
|
"loss": 1.2415,
|
|
"mean_token_accuracy": 0.7044022262096405,
|
|
"num_tokens": 247571180.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.134344676780101,
|
|
"grad_norm": 0.13104963705940764,
|
|
"learning_rate": 2.231576722558828e-06,
|
|
"loss": 1.1936,
|
|
"mean_token_accuracy": 0.7127008080482483,
|
|
"num_tokens": 248475074.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.135711357113571,
|
|
"grad_norm": 0.1225985878801414,
|
|
"learning_rate": 2.2280541073693114e-06,
|
|
"loss": 1.1962,
|
|
"mean_token_accuracy": 0.7121625602245331,
|
|
"num_tokens": 249406307.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 1.137078037447041,
|
|
"grad_norm": 0.13323975650518805,
|
|
"learning_rate": 2.2245314921797945e-06,
|
|
"loss": 1.2405,
|
|
"mean_token_accuracy": 0.7033373177051544,
|
|
"num_tokens": 250336937.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 1.1384447177805113,
|
|
"grad_norm": 0.12195976374626233,
|
|
"learning_rate": 2.2210088769902777e-06,
|
|
"loss": 1.2217,
|
|
"mean_token_accuracy": 0.7081094801425933,
|
|
"num_tokens": 251277347.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 1.09609375,
|
|
"epoch": 1.1398113981139812,
|
|
"grad_norm": 0.13691389146248903,
|
|
"learning_rate": 2.217486261800761e-06,
|
|
"loss": 1.1005,
|
|
"mean_token_accuracy": 0.7292150914669037,
|
|
"num_tokens": 252142631.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.1411780784474512,
|
|
"grad_norm": 0.11993398114386948,
|
|
"learning_rate": 2.2139636466112443e-06,
|
|
"loss": 1.2066,
|
|
"mean_token_accuracy": 0.710022222995758,
|
|
"num_tokens": 253096610.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.1425447587809212,
|
|
"grad_norm": 0.12311270457067999,
|
|
"learning_rate": 2.2104410314217275e-06,
|
|
"loss": 1.182,
|
|
"mean_token_accuracy": 0.7156462967395782,
|
|
"num_tokens": 254006213.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 1.1439114391143912,
|
|
"grad_norm": 0.13272127771925227,
|
|
"learning_rate": 2.206918416232211e-06,
|
|
"loss": 1.2519,
|
|
"mean_token_accuracy": 0.7013237774372101,
|
|
"num_tokens": 254953818.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 1.23359375,
|
|
"epoch": 1.1452781194478612,
|
|
"grad_norm": 0.1172051118223852,
|
|
"learning_rate": 2.203395801042694e-06,
|
|
"loss": 1.2313,
|
|
"mean_token_accuracy": 0.7028139889240265,
|
|
"num_tokens": 255894131.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.1466447997813312,
|
|
"grad_norm": 0.12668024466027475,
|
|
"learning_rate": 2.1998731858531777e-06,
|
|
"loss": 1.1988,
|
|
"mean_token_accuracy": 0.7112480282783509,
|
|
"num_tokens": 256863827.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.1480114801148011,
|
|
"grad_norm": 0.1199349606046203,
|
|
"learning_rate": 2.196350570663661e-06,
|
|
"loss": 1.2312,
|
|
"mean_token_accuracy": 0.7071394681930542,
|
|
"num_tokens": 257792102.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.1493781604482711,
|
|
"grad_norm": 0.12788602081064246,
|
|
"learning_rate": 2.192827955474144e-06,
|
|
"loss": 1.2038,
|
|
"mean_token_accuracy": 0.7101231217384338,
|
|
"num_tokens": 258691796.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.150744840781741,
|
|
"grad_norm": 0.11476309745529288,
|
|
"learning_rate": 2.1893053402846275e-06,
|
|
"loss": 1.1734,
|
|
"mean_token_accuracy": 0.7183996498584747,
|
|
"num_tokens": 259615283.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 1.24609375,
|
|
"epoch": 1.152111521115211,
|
|
"grad_norm": 0.11124853015881056,
|
|
"learning_rate": 2.1857827250951106e-06,
|
|
"loss": 1.2536,
|
|
"mean_token_accuracy": 0.7047743618488311,
|
|
"num_tokens": 260564975.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 1.22578125,
|
|
"epoch": 1.153478201448681,
|
|
"grad_norm": 0.1659569100089984,
|
|
"learning_rate": 2.182260109905594e-06,
|
|
"loss": 1.2353,
|
|
"mean_token_accuracy": 0.7027330994606018,
|
|
"num_tokens": 261489343.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.154844881782151,
|
|
"grad_norm": 0.1442726751480761,
|
|
"learning_rate": 2.1787374947160777e-06,
|
|
"loss": 1.1909,
|
|
"mean_token_accuracy": 0.7160134255886078,
|
|
"num_tokens": 262386937.0,
|
|
"step": 8450
|
|
},
|
|
{
"entropy": 1.175,
"epoch": 1.156211562115621,
"grad_norm": 0.11814350519985048,
"learning_rate": 2.175214879526561e-06,
"loss": 1.177,
"mean_token_accuracy": 0.7149736285209656,
"num_tokens": 263346814.0,
"step": 8460
},
{
"entropy": 1.21328125,
"epoch": 1.1575782424490912,
"grad_norm": 0.11628383756508304,
"learning_rate": 2.171692264337044e-06,
"loss": 1.2093,
"mean_token_accuracy": 0.710291463136673,
"num_tokens": 264250592.0,
"step": 8470
},
{
"entropy": 1.18203125,
"epoch": 1.1589449227825612,
"grad_norm": 0.13092456783998438,
"learning_rate": 2.1681696491475275e-06,
"loss": 1.1761,
"mean_token_accuracy": 0.7156914591789245,
"num_tokens": 265145979.0,
"step": 8480
},
{
"entropy": 1.2046875,
"epoch": 1.1603116031160312,
"grad_norm": 0.116752006721421,
"learning_rate": 2.1646470339580106e-06,
"loss": 1.2045,
"mean_token_accuracy": 0.7112696290016174,
"num_tokens": 266123353.0,
"step": 8490
},
{
"entropy": 1.16953125,
"epoch": 1.1616782834495012,
"grad_norm": 0.1550206256465822,
"learning_rate": 2.1611244187684937e-06,
"loss": 1.1655,
"mean_token_accuracy": 0.7199881792068481,
"num_tokens": 267029186.0,
"step": 8500
},
{
"entropy": 1.178125,
"epoch": 1.1630449637829712,
"grad_norm": 0.12213061465920991,
"learning_rate": 2.1576018035789773e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.7153444170951844,
"num_tokens": 267989534.0,
"step": 8510
},
{
"entropy": 1.15859375,
"epoch": 1.1644116441164412,
"grad_norm": 0.13015108313588736,
"learning_rate": 2.1540791883894604e-06,
"loss": 1.1632,
"mean_token_accuracy": 0.7171327114105225,
"num_tokens": 268856238.0,
"step": 8520
},
{
"entropy": 1.21484375,
"epoch": 1.1657783244499111,
"grad_norm": 0.13493117084719072,
"learning_rate": 2.150556573199944e-06,
"loss": 1.2232,
"mean_token_accuracy": 0.709650868177414,
"num_tokens": 269761959.0,
"step": 8530
},
{
"entropy": 1.1890625,
"epoch": 1.1671450047833811,
"grad_norm": 0.11381157568473028,
"learning_rate": 2.147033958010427e-06,
"loss": 1.1899,
"mean_token_accuracy": 0.7127987504005432,
"num_tokens": 270677428.0,
"step": 8540
},
{
"entropy": 1.2140625,
"epoch": 1.168511685116851,
"grad_norm": 0.1448075017440773,
"learning_rate": 2.14351134282091e-06,
"loss": 1.2062,
"mean_token_accuracy": 0.7112777471542359,
"num_tokens": 271617854.0,
"step": 8550
},
{
"entropy": 1.1625,
"epoch": 1.169878365450321,
"grad_norm": 0.12355274178120798,
"learning_rate": 2.1399887276313937e-06,
"loss": 1.1461,
"mean_token_accuracy": 0.7222738027572632,
"num_tokens": 272563454.0,
"step": 8560
},
{
"entropy": 1.21640625,
"epoch": 1.1712450457837913,
"grad_norm": 0.12157456876275988,
"learning_rate": 2.136466112441877e-06,
"loss": 1.2255,
"mean_token_accuracy": 0.7083093583583832,
"num_tokens": 273503103.0,
"step": 8570
},
{
"entropy": 1.180859375,
"epoch": 1.1726117261172613,
"grad_norm": 0.14169130825184922,
"learning_rate": 2.1329434972523604e-06,
"loss": 1.1871,
"mean_token_accuracy": 0.7143826067447663,
"num_tokens": 274417995.0,
"step": 8580
},
{
"entropy": 1.190625,
"epoch": 1.1739784064507313,
"grad_norm": 0.11067038339694696,
"learning_rate": 2.1294208820628435e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.7157198190689087,
"num_tokens": 275315548.0,
"step": 8590
},
{
"entropy": 1.228125,
"epoch": 1.1753450867842012,
"grad_norm": 0.11678241629836797,
"learning_rate": 2.125898266873327e-06,
"loss": 1.2312,
"mean_token_accuracy": 0.7033393323421478,
"num_tokens": 276295727.0,
"step": 8600
},
{
"entropy": 1.22734375,
"epoch": 1.1767117671176712,
"grad_norm": 0.1336886861134953,
"learning_rate": 2.12237565168381e-06,
"loss": 1.2243,
"mean_token_accuracy": 0.7072099626064301,
"num_tokens": 277224705.0,
"step": 8610
},
{
"entropy": 1.2015625,
"epoch": 1.1780784474511412,
"grad_norm": 0.1329017186408969,
"learning_rate": 2.1188530364942938e-06,
"loss": 1.2024,
"mean_token_accuracy": 0.7104719340801239,
"num_tokens": 278150086.0,
"step": 8620
},
{
"entropy": 1.140625,
"epoch": 1.1794451277846112,
"grad_norm": 0.11480447210135773,
"learning_rate": 2.115330421304777e-06,
"loss": 1.1455,
"mean_token_accuracy": 0.7200540900230408,
"num_tokens": 279062043.0,
"step": 8630
},
{
"entropy": 1.1921875,
"epoch": 1.1808118081180812,
"grad_norm": 0.12171120383050826,
"learning_rate": 2.1118078061152604e-06,
"loss": 1.2057,
"mean_token_accuracy": 0.7116602241992951,
"num_tokens": 280028824.0,
"step": 8640
},
{
"entropy": 1.15859375,
"epoch": 1.1821784884515512,
"grad_norm": 0.13318327008519418,
"learning_rate": 2.1082851909257435e-06,
"loss": 1.1754,
"mean_token_accuracy": 0.7158086240291596,
"num_tokens": 280962269.0,
"step": 8650
},
{
"entropy": 1.18984375,
"epoch": 1.1835451687850211,
"grad_norm": 0.10378684358194558,
"learning_rate": 2.1047625757362267e-06,
"loss": 1.1919,
"mean_token_accuracy": 0.7137446165084839,
"num_tokens": 281911218.0,
"step": 8660
},
{
"entropy": 1.1890625,
"epoch": 1.1849118491184911,
"grad_norm": 0.12568387763179376,
"learning_rate": 2.1012399605467102e-06,
"loss": 1.2074,
"mean_token_accuracy": 0.7103706121444702,
"num_tokens": 282800040.0,
"step": 8670
},
{
"entropy": 1.2265625,
"epoch": 1.186278529451961,
"grad_norm": 0.13319049277726738,
"learning_rate": 2.0977173453571933e-06,
"loss": 1.235,
"mean_token_accuracy": 0.7068395733833313,
"num_tokens": 283682358.0,
"step": 8680
},
{
"entropy": 1.2265625,
"epoch": 1.187645209785431,
"grad_norm": 0.1139349200135377,
"learning_rate": 2.0941947301676765e-06,
"loss": 1.2352,
"mean_token_accuracy": 0.7057213425636292,
"num_tokens": 284555053.0,
"step": 8690
},
{
"entropy": 1.16484375,
"epoch": 1.189011890118901,
"grad_norm": 0.13066783046937455,
"learning_rate": 2.09067211497816e-06,
"loss": 1.1613,
"mean_token_accuracy": 0.7169568300247192,
"num_tokens": 285491698.0,
"step": 8700
},
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.1903785704523713,
|
|
"grad_norm": 0.11590795522123674,
|
|
"learning_rate": 2.087149499788643e-06,
|
|
"loss": 1.1818,
|
|
"mean_token_accuracy": 0.7127669155597687,
|
|
"num_tokens": 286406503.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 1.1917452507858413,
|
|
"grad_norm": 0.13548778985027513,
|
|
"learning_rate": 2.0836268845991263e-06,
|
|
"loss": 1.2346,
|
|
"mean_token_accuracy": 0.7051625549793243,
|
|
"num_tokens": 287327006.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.1931119311193112,
|
|
"grad_norm": 0.12315138001233818,
|
|
"learning_rate": 2.08010426940961e-06,
|
|
"loss": 1.2235,
|
|
"mean_token_accuracy": 0.7098001062870025,
|
|
"num_tokens": 288277781.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 1.1944786114527812,
|
|
"grad_norm": 0.1347898828554493,
|
|
"learning_rate": 2.076581654220093e-06,
|
|
"loss": 1.2401,
|
|
"mean_token_accuracy": 0.7058073878288269,
|
|
"num_tokens": 289203815.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 1.1958452917862512,
|
|
"grad_norm": 0.12900138612986445,
|
|
"learning_rate": 2.0730590390305765e-06,
|
|
"loss": 1.1793,
|
|
"mean_token_accuracy": 0.7159371793270111,
|
|
"num_tokens": 290113901.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.1972119721197212,
|
|
"grad_norm": 0.13961297996326802,
|
|
"learning_rate": 2.0695364238410596e-06,
|
|
"loss": 1.1998,
|
|
"mean_token_accuracy": 0.7111155033111572,
|
|
"num_tokens": 291089205.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 1.1985786524531912,
|
|
"grad_norm": 0.12222480129253188,
|
|
"learning_rate": 2.066013808651543e-06,
|
|
"loss": 1.2081,
|
|
"mean_token_accuracy": 0.708154046535492,
|
|
"num_tokens": 292008477.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 1.221875,
|
|
"epoch": 1.1999453327866612,
|
|
"grad_norm": 0.16071544446737765,
|
|
"learning_rate": 2.0624911934620263e-06,
|
|
"loss": 1.2214,
|
|
"mean_token_accuracy": 0.7086666405200959,
|
|
"num_tokens": 292887209.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 1.2013120131201311,
|
|
"grad_norm": 0.12589691250224322,
|
|
"learning_rate": 2.05896857827251e-06,
|
|
"loss": 1.2384,
|
|
"mean_token_accuracy": 0.7022940516471863,
|
|
"num_tokens": 293856814.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.2026786934536011,
|
|
"grad_norm": 0.11989732393217377,
|
|
"learning_rate": 2.055445963082993e-06,
|
|
"loss": 1.1691,
|
|
"mean_token_accuracy": 0.7200933873653412,
|
|
"num_tokens": 294765032.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.2040453737870713,
|
|
"grad_norm": 0.14489527103273564,
|
|
"learning_rate": 2.0519233478934765e-06,
|
|
"loss": 1.1808,
|
|
"mean_token_accuracy": 0.7147626340389251,
|
|
"num_tokens": 295715543.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.2054120541205413,
|
|
"grad_norm": 0.11502951342549073,
|
|
"learning_rate": 2.0484007327039596e-06,
|
|
"loss": 1.2301,
|
|
"mean_token_accuracy": 0.7060695290565491,
|
|
"num_tokens": 296625710.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 1.2067787344540113,
|
|
"grad_norm": 0.12121153419944838,
|
|
"learning_rate": 2.0448781175144427e-06,
|
|
"loss": 1.1978,
|
|
"mean_token_accuracy": 0.712634825706482,
|
|
"num_tokens": 297538530.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.2081454147874813,
|
|
"grad_norm": 0.11657724563943406,
|
|
"learning_rate": 2.0413555023249263e-06,
|
|
"loss": 1.1921,
|
|
"mean_token_accuracy": 0.7135513603687287,
|
|
"num_tokens": 298470497.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 1.2095120951209513,
|
|
"grad_norm": 0.11531341414005884,
|
|
"learning_rate": 2.0378328871354094e-06,
|
|
"loss": 1.2,
|
|
"mean_token_accuracy": 0.7104939758777619,
|
|
"num_tokens": 299428739.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 1.2108787754544212,
|
|
"grad_norm": 0.12346690818414631,
|
|
"learning_rate": 2.034310271945893e-06,
|
|
"loss": 1.2066,
|
|
"mean_token_accuracy": 0.7122603833675385,
|
|
"num_tokens": 300393975.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.2122454557878912,
|
|
"grad_norm": 0.1317138780751567,
|
|
"learning_rate": 2.030787656756376e-06,
|
|
"loss": 1.1739,
|
|
"mean_token_accuracy": 0.7136736869812011,
|
|
"num_tokens": 301302014.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.2136121361213612,
|
|
"grad_norm": 0.11447813039930739,
|
|
"learning_rate": 2.027265041566859e-06,
|
|
"loss": 1.1806,
|
|
"mean_token_accuracy": 0.7156505763530732,
|
|
"num_tokens": 302216270.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.2149788164548312,
|
|
"grad_norm": 0.11814882820481575,
|
|
"learning_rate": 2.0237424263773427e-06,
|
|
"loss": 1.1933,
|
|
"mean_token_accuracy": 0.7140685200691224,
|
|
"num_tokens": 303104415.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.2163454967883012,
|
|
"grad_norm": 0.10485543521183677,
|
|
"learning_rate": 2.020219811187826e-06,
|
|
"loss": 1.1381,
|
|
"mean_token_accuracy": 0.7230295658111572,
|
|
"num_tokens": 304046821.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 1.2177121771217712,
|
|
"grad_norm": 0.11739019922043473,
|
|
"learning_rate": 2.0166971959983094e-06,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.7058708250522614,
|
|
"num_tokens": 304983338.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.2190788574552411,
|
|
"grad_norm": 0.14632658669671086,
|
|
"learning_rate": 2.0131745808087925e-06,
|
|
"loss": 1.1651,
|
|
"mean_token_accuracy": 0.7176449120044708,
|
|
"num_tokens": 305914064.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.2204455377887111,
|
|
"grad_norm": 0.12166464289094694,
|
|
"learning_rate": 2.009651965619276e-06,
|
|
"loss": 1.1771,
|
|
"mean_token_accuracy": 0.7155236184597016,
|
|
"num_tokens": 306837431.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 1.221812218122181,
|
|
"grad_norm": 0.13852561702490526,
|
|
"learning_rate": 2.0061293504297592e-06,
|
|
"loss": 1.2421,
|
|
"mean_token_accuracy": 0.7041119813919068,
|
|
"num_tokens": 307782336.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 1.176953125,
|
|
"epoch": 1.2231788984556513,
|
|
"grad_norm": 0.11894130142270219,
|
|
"learning_rate": 2.0026067352402428e-06,
|
|
"loss": 1.1769,
|
|
"mean_token_accuracy": 0.7161159813404083,
|
|
"num_tokens": 308711682.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 1.2245455787891213,
|
|
"grad_norm": 0.1357730224114179,
|
|
"learning_rate": 1.999084120050726e-06,
|
|
"loss": 1.24,
|
|
"mean_token_accuracy": 0.7056409418582916,
|
|
"num_tokens": 309636144.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.2259122591225913,
|
|
"grad_norm": 0.11492276861453361,
|
|
"learning_rate": 1.9955615048612094e-06,
|
|
"loss": 1.1501,
|
|
"mean_token_accuracy": 0.7205142140388489,
|
|
"num_tokens": 310551882.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 1.2272789394560613,
|
|
"grad_norm": 0.14393215036742432,
|
|
"learning_rate": 1.9920388896716926e-06,
|
|
"loss": 1.2075,
|
|
"mean_token_accuracy": 0.7107318580150604,
|
|
"num_tokens": 311486040.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.2286456197895312,
|
|
"grad_norm": 0.11464595178932414,
|
|
"learning_rate": 1.9885162744821757e-06,
|
|
"loss": 1.2189,
|
|
"mean_token_accuracy": 0.7117345869541168,
|
|
"num_tokens": 312406968.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.2300123001230012,
|
|
"grad_norm": 0.12284053573041742,
|
|
"learning_rate": 1.9849936592926592e-06,
|
|
"loss": 1.1875,
|
|
"mean_token_accuracy": 0.7119090914726257,
|
|
"num_tokens": 313267152.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 1.181640625,
|
|
"epoch": 1.2313789804564712,
|
|
"grad_norm": 0.1373877940849616,
|
|
"learning_rate": 1.9814710441031424e-06,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.7149649918079376,
|
|
"num_tokens": 314176228.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.2327456607899412,
|
|
"grad_norm": 0.12862158383495068,
|
|
"learning_rate": 1.9779484289136255e-06,
|
|
"loss": 1.1623,
|
|
"mean_token_accuracy": 0.7190293490886688,
|
|
"num_tokens": 315075368.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 1.2341123411234112,
|
|
"grad_norm": 0.1196391177245474,
|
|
"learning_rate": 1.974425813724109e-06,
|
|
"loss": 1.2173,
|
|
"mean_token_accuracy": 0.7092306077480316,
|
|
"num_tokens": 316004568.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.2354790214568812,
|
|
"grad_norm": 0.14504445547541248,
|
|
"learning_rate": 1.970903198534592e-06,
|
|
"loss": 1.1824,
|
|
"mean_token_accuracy": 0.7142476916313172,
|
|
"num_tokens": 316945499.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.2368457017903514,
|
|
"grad_norm": 0.11840724460703862,
|
|
"learning_rate": 1.9673805833450753e-06,
|
|
"loss": 1.206,
|
|
"mean_token_accuracy": 0.7093462586402893,
|
|
"num_tokens": 317881936.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.2382123821238213,
|
|
"grad_norm": 0.11463718012980191,
|
|
"learning_rate": 1.963857968155559e-06,
|
|
"loss": 1.2126,
|
|
"mean_token_accuracy": 0.7091154873371124,
|
|
"num_tokens": 318823017.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.2395790624572913,
|
|
"grad_norm": 0.14446738908756074,
|
|
"learning_rate": 1.960335352966042e-06,
|
|
"loss": 1.1815,
|
|
"mean_token_accuracy": 0.71705322265625,
|
|
"num_tokens": 319745907.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.2409457427907613,
|
|
"grad_norm": 0.12187894852797902,
|
|
"learning_rate": 1.9568127377765255e-06,
|
|
"loss": 1.2043,
|
|
"mean_token_accuracy": 0.7090617001056672,
|
|
"num_tokens": 320691829.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.2423124231242313,
|
|
"grad_norm": 0.1339142210641006,
|
|
"learning_rate": 1.9532901225870086e-06,
|
|
"loss": 1.1975,
|
|
"mean_token_accuracy": 0.7118914961814881,
|
|
"num_tokens": 321667188.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.2436791034577013,
|
|
"grad_norm": 0.13644197154972673,
|
|
"learning_rate": 1.949767507397492e-06,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.7165218651294708,
|
|
"num_tokens": 322560352.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.2450457837911713,
|
|
"grad_norm": 0.1298286979268115,
|
|
"learning_rate": 1.9462448922079753e-06,
|
|
"loss": 1.2043,
|
|
"mean_token_accuracy": 0.7108154237270355,
|
|
"num_tokens": 323471108.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.2464124641246412,
|
|
"grad_norm": 0.11973140839439672,
|
|
"learning_rate": 1.942722277018459e-06,
|
|
"loss": 1.1946,
|
|
"mean_token_accuracy": 0.7148497104644775,
|
|
"num_tokens": 324368333.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 1.2477791444581112,
|
|
"grad_norm": 0.11294679090206816,
|
|
"learning_rate": 1.939199661828942e-06,
|
|
"loss": 1.2269,
|
|
"mean_token_accuracy": 0.7050874531269073,
|
|
"num_tokens": 325280448.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.2491458247915812,
|
|
"grad_norm": 0.12217949282939658,
|
|
"learning_rate": 1.9356770466394255e-06,
|
|
"loss": 1.1456,
|
|
"mean_token_accuracy": 0.7217828333377838,
|
|
"num_tokens": 326129509.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.2505125051250512,
|
|
"grad_norm": 0.1896803049810488,
|
|
"learning_rate": 1.9321544314499086e-06,
|
|
"loss": 1.147,
|
|
"mean_token_accuracy": 0.7214407980442047,
|
|
"num_tokens": 327041128.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.2518791854585212,
|
|
"grad_norm": 0.12592305435196802,
|
|
"learning_rate": 1.9286318162603917e-06,
|
|
"loss": 1.1838,
|
|
"mean_token_accuracy": 0.7120064198970795,
|
|
"num_tokens": 327977834.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.2532458657919912,
|
|
"grad_norm": 0.12348405140146836,
|
|
"learning_rate": 1.9251092010708753e-06,
|
|
"loss": 1.1665,
|
|
"mean_token_accuracy": 0.7195306718349457,
|
|
"num_tokens": 328893874.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.2546125461254611,
|
|
"grad_norm": 0.1156238191079261,
|
|
"learning_rate": 1.9215865858813584e-06,
|
|
"loss": 1.2067,
|
|
"mean_token_accuracy": 0.7092220067977906,
|
|
"num_tokens": 329811172.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.2559792264589311,
|
|
"grad_norm": 0.12608958517973318,
|
|
"learning_rate": 1.918063970691842e-06,
|
|
"loss": 1.1995,
|
|
"mean_token_accuracy": 0.7103578567504882,
|
|
"num_tokens": 330738160.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.2573459067924013,
|
|
"grad_norm": 0.12926648413763284,
|
|
"learning_rate": 1.914541355502325e-06,
|
|
"loss": 1.1973,
|
|
"mean_token_accuracy": 0.709797489643097,
|
|
"num_tokens": 331616816.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.2587125871258713,
|
|
"grad_norm": 0.11069597509805713,
|
|
"learning_rate": 1.9110187403128082e-06,
|
|
"loss": 1.1698,
|
|
"mean_token_accuracy": 0.715229457616806,
|
|
"num_tokens": 332526400.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 1.176953125,
|
|
"epoch": 1.2600792674593413,
|
|
"grad_norm": 0.12631293559721116,
|
|
"learning_rate": 1.9074961251232918e-06,
|
|
"loss": 1.1835,
|
|
"mean_token_accuracy": 0.7151442527770996,
|
|
"num_tokens": 333488573.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.2614459477928113,
|
|
"grad_norm": 0.18925974010561242,
|
|
"learning_rate": 1.903973509933775e-06,
|
|
"loss": 1.1793,
|
|
"mean_token_accuracy": 0.7123641669750214,
|
|
"num_tokens": 334392554.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 1.2628126281262813,
|
|
"grad_norm": 0.12802036515187842,
|
|
"learning_rate": 1.9004508947442582e-06,
|
|
"loss": 1.2213,
|
|
"mean_token_accuracy": 0.7071435928344727,
|
|
"num_tokens": 335285196.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 1.141796875,
|
|
"epoch": 1.2641793084597512,
|
|
"grad_norm": 0.13384927691610438,
|
|
"learning_rate": 1.8969282795547418e-06,
|
|
"loss": 1.1526,
|
|
"mean_token_accuracy": 0.7195989072322846,
|
|
"num_tokens": 336183878.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.2655459887932212,
|
|
"grad_norm": 0.11178836532165949,
|
|
"learning_rate": 1.8934056643652249e-06,
|
|
"loss": 1.1937,
|
|
"mean_token_accuracy": 0.710956084728241,
|
|
"num_tokens": 337094560.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.2669126691266912,
|
|
"grad_norm": 0.11240243016879588,
|
|
"learning_rate": 1.889883049175708e-06,
|
|
"loss": 1.1894,
|
|
"mean_token_accuracy": 0.7141939282417298,
|
|
"num_tokens": 337995808.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.2682793494601612,
|
|
"grad_norm": 0.14188586230541972,
|
|
"learning_rate": 1.8863604339861916e-06,
|
|
"loss": 1.1503,
|
|
"mean_token_accuracy": 0.7196313381195069,
|
|
"num_tokens": 338914539.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 1.2696460297936314,
|
|
"grad_norm": 0.12395705222058516,
|
|
"learning_rate": 1.8828378187966747e-06,
|
|
"loss": 1.22,
|
|
"mean_token_accuracy": 0.7094048500061035,
|
|
"num_tokens": 339851776.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.2710127101271014,
|
|
"grad_norm": 0.1259047167448291,
|
|
"learning_rate": 1.8793152036071582e-06,
|
|
"loss": 1.1909,
|
|
"mean_token_accuracy": 0.7130910813808441,
|
|
"num_tokens": 340765564.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.2723793904605714,
|
|
"grad_norm": 0.12383926234821117,
|
|
"learning_rate": 1.8757925884176414e-06,
|
|
"loss": 1.1713,
|
|
"mean_token_accuracy": 0.714445048570633,
|
|
"num_tokens": 341703508.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 1.2737460707940413,
|
|
"grad_norm": 0.1185014567961521,
|
|
"learning_rate": 1.8722699732281247e-06,
|
|
"loss": 1.2478,
|
|
"mean_token_accuracy": 0.7004268884658813,
|
|
"num_tokens": 342693008.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 1.2751127511275113,
|
|
"grad_norm": 0.11765538396260983,
|
|
"learning_rate": 1.868747358038608e-06,
|
|
"loss": 1.2248,
|
|
"mean_token_accuracy": 0.7057516157627106,
|
|
"num_tokens": 343600116.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.2764794314609813,
|
|
"grad_norm": 0.11851917726742367,
|
|
"learning_rate": 1.8652247428490914e-06,
|
|
"loss": 1.1605,
|
|
"mean_token_accuracy": 0.7200560927391052,
|
|
"num_tokens": 344516976.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.2778461117944513,
|
|
"grad_norm": 0.11839324220328264,
|
|
"learning_rate": 1.8617021276595745e-06,
|
|
"loss": 1.2141,
|
|
"mean_token_accuracy": 0.7110005199909211,
|
|
"num_tokens": 345423078.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.2792127921279213,
|
|
"grad_norm": 0.12680206402317706,
|
|
"learning_rate": 1.858179512470058e-06,
|
|
"loss": 1.1805,
|
|
"mean_token_accuracy": 0.7176225483417511,
|
|
"num_tokens": 346333353.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.2805794724613913,
|
|
"grad_norm": 0.21856580182857493,
|
|
"learning_rate": 1.8546568972805412e-06,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7171274602413178,
|
|
"num_tokens": 347312378.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.2819461527948612,
|
|
"grad_norm": 0.11814831903056353,
|
|
"learning_rate": 1.8511342820910245e-06,
|
|
"loss": 1.1609,
|
|
"mean_token_accuracy": 0.7175111174583435,
|
|
"num_tokens": 348227611.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.2833128331283312,
|
|
"grad_norm": 0.13017722869322146,
|
|
"learning_rate": 1.8476116669015078e-06,
|
|
"loss": 1.1647,
|
|
"mean_token_accuracy": 0.7203461647033691,
|
|
"num_tokens": 349167117.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.2846795134618012,
|
|
"grad_norm": 0.1443720092454605,
|
|
"learning_rate": 1.8440890517119912e-06,
|
|
"loss": 1.1505,
|
|
"mean_token_accuracy": 0.720772248506546,
|
|
"num_tokens": 350098803.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.2860461937952712,
|
|
"grad_norm": 0.13375504132583943,
|
|
"learning_rate": 1.8405664365224743e-06,
|
|
"loss": 1.1727,
|
|
"mean_token_accuracy": 0.7169567465782165,
|
|
"num_tokens": 350986062.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 1.2874128741287412,
|
|
"grad_norm": 0.11414638207454728,
|
|
"learning_rate": 1.8370438213329578e-06,
|
|
"loss": 1.2382,
|
|
"mean_token_accuracy": 0.7054317593574524,
|
|
"num_tokens": 351957899.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.2887795544622112,
|
|
"grad_norm": 0.16895188590480362,
|
|
"learning_rate": 1.833521206143441e-06,
|
|
"loss": 1.1964,
|
|
"mean_token_accuracy": 0.7115585029125213,
|
|
"num_tokens": 352883664.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 1.2901462347956814,
|
|
"grad_norm": 0.13641259066008746,
|
|
"learning_rate": 1.8299985909539245e-06,
|
|
"loss": 1.2419,
|
|
"mean_token_accuracy": 0.7040064871311188,
|
|
"num_tokens": 353769701.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.2915129151291513,
|
|
"grad_norm": 0.13641229586232145,
|
|
"learning_rate": 1.8264759757644076e-06,
|
|
"loss": 1.1879,
|
|
"mean_token_accuracy": 0.713461571931839,
|
|
"num_tokens": 354684621.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.2928795954626213,
|
|
"grad_norm": 0.1287359979275486,
|
|
"learning_rate": 1.822953360574891e-06,
|
|
"loss": 1.1602,
|
|
"mean_token_accuracy": 0.7191748797893525,
|
|
"num_tokens": 355562695.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.2942462757960913,
|
|
"grad_norm": 0.11177024431461777,
|
|
"learning_rate": 1.8194307453853743e-06,
|
|
"loss": 1.157,
|
|
"mean_token_accuracy": 0.7201085150241852,
|
|
"num_tokens": 356470940.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 1.2265625,
|
|
"epoch": 1.2956129561295613,
|
|
"grad_norm": 0.12346489744996451,
|
|
"learning_rate": 1.8159081301958576e-06,
|
|
"loss": 1.2341,
|
|
"mean_token_accuracy": 0.7066496133804321,
|
|
"num_tokens": 357413999.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.2969796364630313,
|
|
"grad_norm": 0.11801713070974668,
|
|
"learning_rate": 1.8123855150063408e-06,
|
|
"loss": 1.1984,
|
|
"mean_token_accuracy": 0.7132660746574402,
|
|
"num_tokens": 358350189.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.2983463167965013,
|
|
"grad_norm": 0.11610755000388147,
|
|
"learning_rate": 1.8088628998168243e-06,
|
|
"loss": 1.1952,
|
|
"mean_token_accuracy": 0.7119006812572479,
|
|
"num_tokens": 359245597.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.2997129971299712,
|
|
"grad_norm": 0.14021471393663001,
|
|
"learning_rate": 1.8053402846273074e-06,
|
|
"loss": 1.1444,
|
|
"mean_token_accuracy": 0.7213627755641937,
|
|
"num_tokens": 360154413.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.3010796774634412,
|
|
"grad_norm": 0.1143706558151874,
|
|
"learning_rate": 1.8018176694377906e-06,
|
|
"loss": 1.1398,
|
|
"mean_token_accuracy": 0.7224727153778077,
|
|
"num_tokens": 361057943.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.3024463577969114,
|
|
"grad_norm": 0.14280539478339457,
|
|
"learning_rate": 1.798295054248274e-06,
|
|
"loss": 1.2157,
|
|
"mean_token_accuracy": 0.7082219243049621,
|
|
"num_tokens": 361949519.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.3038130381303814,
|
|
"grad_norm": 0.12946184075092582,
|
|
"learning_rate": 1.7947724390587572e-06,
|
|
"loss": 1.1715,
|
|
"mean_token_accuracy": 0.7174887239933014,
|
|
"num_tokens": 362849669.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 1.3051797184638514,
|
|
"grad_norm": 0.13522638746563145,
|
|
"learning_rate": 1.7912498238692408e-06,
|
|
"loss": 1.2126,
|
|
"mean_token_accuracy": 0.7096715033054352,
|
|
"num_tokens": 363795866.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.3065463987973214,
|
|
"grad_norm": 0.11864517750741678,
|
|
"learning_rate": 1.7877272086797239e-06,
|
|
"loss": 1.1747,
|
|
"mean_token_accuracy": 0.7168461143970489,
|
|
"num_tokens": 364718978.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 1.20546875,
|
|
"epoch": 1.3079130791307914,
|
|
"grad_norm": 0.11484097394369115,
|
|
"learning_rate": 1.7842045934902072e-06,
|
|
"loss": 1.2054,
|
|
"mean_token_accuracy": 0.707341468334198,
|
|
"num_tokens": 365657200.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.3092797594642613,
|
|
"grad_norm": 0.12136921775918083,
|
|
"learning_rate": 1.7806819783006908e-06,
|
|
"loss": 1.185,
|
|
"mean_token_accuracy": 0.715130192041397,
|
|
"num_tokens": 366573525.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.3106464397977313,
|
|
"grad_norm": 0.14920322158625068,
|
|
"learning_rate": 1.777159363111174e-06,
|
|
"loss": 1.1809,
|
|
"mean_token_accuracy": 0.7154152572154999,
|
|
"num_tokens": 367497736.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.3120131201312013,
|
|
"grad_norm": 0.12327434069368136,
|
|
"learning_rate": 1.773636747921657e-06,
|
|
"loss": 1.1749,
|
|
"mean_token_accuracy": 0.7165960133075714,
|
|
"num_tokens": 368370959.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.3133798004646713,
|
|
"grad_norm": 0.14200583660686783,
|
|
"learning_rate": 1.7701141327321406e-06,
|
|
"loss": 1.1725,
|
|
"mean_token_accuracy": 0.7174287140369415,
|
|
"num_tokens": 369327037.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 1.3147464807981413,
|
|
"grad_norm": 0.14839018125107545,
|
|
"learning_rate": 1.7665915175426237e-06,
|
|
"loss": 1.2029,
|
|
"mean_token_accuracy": 0.71070516705513,
|
|
"num_tokens": 370297506.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.3161131611316113,
|
|
"grad_norm": 0.19005598422537984,
|
|
"learning_rate": 1.763068902353107e-06,
|
|
"loss": 1.1633,
|
|
"mean_token_accuracy": 0.716217315196991,
|
|
"num_tokens": 371193145.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.3174798414650812,
|
|
"grad_norm": 0.12028462485465799,
|
|
"learning_rate": 1.7595462871635904e-06,
|
|
"loss": 1.2137,
|
|
"mean_token_accuracy": 0.7074035704135895,
|
|
"num_tokens": 372082900.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 1.3188465217985512,
|
|
"grad_norm": 0.13785403694664225,
|
|
"learning_rate": 1.7560236719740737e-06,
|
|
"loss": 1.1543,
|
|
"mean_token_accuracy": 0.720289021730423,
|
|
"num_tokens": 373016609.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.3202132021320212,
|
|
"grad_norm": 0.12772712283876467,
|
|
"learning_rate": 1.752501056784557e-06,
|
|
"loss": 1.1898,
|
|
"mean_token_accuracy": 0.7175466477870941,
|
|
"num_tokens": 373964826.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 1.3215798824654912,
|
|
"grad_norm": 0.11569044976453231,
|
|
"learning_rate": 1.7489784415950404e-06,
|
|
"loss": 1.1935,
|
|
"mean_token_accuracy": 0.7118521332740784,
|
|
"num_tokens": 374849897.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 1.1515625,
|
|
"epoch": 1.3229465627989614,
|
|
"grad_norm": 0.1517506854671187,
|
|
"learning_rate": 1.7454558264055235e-06,
|
|
"loss": 1.1416,
|
|
"mean_token_accuracy": 0.72141934633255,
|
|
"num_tokens": 375745745.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.3243132431324314,
|
|
"grad_norm": 0.1407191993430954,
|
|
"learning_rate": 1.741933211216007e-06,
|
|
"loss": 1.183,
|
|
"mean_token_accuracy": 0.7163554668426514,
|
|
"num_tokens": 376635869.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.3256799234659014,
|
|
"grad_norm": 0.12300563422999793,
|
|
"learning_rate": 1.7384105960264902e-06,
|
|
"loss": 1.1801,
|
|
"mean_token_accuracy": 0.7187091171741485,
|
|
"num_tokens": 377564535.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 1.2484375,
|
|
"epoch": 1.3270466037993713,
|
|
"grad_norm": 0.13190521971033986,
|
|
"learning_rate": 1.7348879808369735e-06,
|
|
"loss": 1.2471,
|
|
"mean_token_accuracy": 0.6994648754596711,
|
|
"num_tokens": 378514613.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.3284132841328413,
|
|
"grad_norm": 0.12166367944410339,
|
|
"learning_rate": 1.7313653656474568e-06,
|
|
"loss": 1.1759,
|
|
"mean_token_accuracy": 0.7147725522518158,
|
|
"num_tokens": 379461912.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 1.3297799644663113,
|
|
"grad_norm": 0.15181389881389726,
|
|
"learning_rate": 1.7278427504579402e-06,
|
|
"loss": 1.246,
|
|
"mean_token_accuracy": 0.7046543836593628,
|
|
"num_tokens": 380419046.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.3311466447997813,
|
|
"grad_norm": 0.1178725617287622,
|
|
"learning_rate": 1.7243201352684233e-06,
|
|
"loss": 1.2431,
|
|
"mean_token_accuracy": 0.703360515832901,
|
|
"num_tokens": 381381738.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.3325133251332513,
|
|
"grad_norm": 0.1284767319574015,
|
|
"learning_rate": 1.7207975200789068e-06,
|
|
"loss": 1.2114,
|
|
"mean_token_accuracy": 0.7082740306854248,
|
|
"num_tokens": 382305754.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.3338800054667213,
|
|
"grad_norm": 0.17019525627316304,
|
|
"learning_rate": 1.71727490488939e-06,
|
|
"loss": 1.1941,
|
|
"mean_token_accuracy": 0.7120652139186859,
|
|
"num_tokens": 383177708.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.3352466858001915,
|
|
"grad_norm": 0.1270184231794503,
|
|
"learning_rate": 1.7137522896998735e-06,
|
|
"loss": 1.1757,
|
|
"mean_token_accuracy": 0.7154577732086181,
|
|
"num_tokens": 384108839.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.3366133661336614,
|
|
"grad_norm": 0.11751841698621558,
|
|
"learning_rate": 1.7102296745103566e-06,
|
|
"loss": 1.1477,
|
|
"mean_token_accuracy": 0.7236140489578247,
|
|
"num_tokens": 384994049.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 1.195703125,
|
|
"epoch": 1.3379800464671314,
|
|
"grad_norm": 0.13758351451934958,
|
|
"learning_rate": 1.7067070593208398e-06,
|
|
"loss": 1.1965,
|
|
"mean_token_accuracy": 0.7116427183151245,
|
|
"num_tokens": 385940948.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.3393467268006014,
|
|
"grad_norm": 0.11412294928393561,
|
|
"learning_rate": 1.7031844441313233e-06,
|
|
"loss": 1.1891,
|
|
"mean_token_accuracy": 0.7143591105937958,
|
|
"num_tokens": 386868665.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.3407134071340714,
|
|
"grad_norm": 0.11952992582834501,
|
|
"learning_rate": 1.6996618289418064e-06,
|
|
"loss": 1.1769,
|
|
"mean_token_accuracy": 0.7169935762882232,
|
|
"num_tokens": 387801284.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.3420800874675414,
|
|
"grad_norm": 0.1321118868994828,
|
|
"learning_rate": 1.6961392137522898e-06,
|
|
"loss": 1.189,
|
|
"mean_token_accuracy": 0.7109399378299713,
|
|
"num_tokens": 388671301.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.3434467678010114,
|
|
"grad_norm": 0.11170003628172485,
|
|
"learning_rate": 1.6926165985627733e-06,
|
|
"loss": 1.1276,
|
|
"mean_token_accuracy": 0.7254051268100739,
|
|
"num_tokens": 389547566.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 1.3448134481344813,
|
|
"grad_norm": 0.1283419301502496,
|
|
"learning_rate": 1.6890939833732564e-06,
|
|
"loss": 1.2157,
|
|
"mean_token_accuracy": 0.7074860155582428,
|
|
"num_tokens": 390499064.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.3461801284679513,
|
|
"grad_norm": 0.11990264415360036,
|
|
"learning_rate": 1.6855713681837396e-06,
|
|
"loss": 1.217,
|
|
"mean_token_accuracy": 0.7085027992725372,
|
|
"num_tokens": 391433860.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 1.3475468088014213,
|
|
"grad_norm": 0.13030681734907287,
|
|
"learning_rate": 1.6820487529942231e-06,
|
|
"loss": 1.1635,
|
|
"mean_token_accuracy": 0.7175294041633606,
|
|
"num_tokens": 392346129.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.3489134891348913,
|
|
"grad_norm": 0.14232246479685048,
|
|
"learning_rate": 1.6785261378047062e-06,
|
|
"loss": 1.1531,
|
|
"mean_token_accuracy": 0.7197709262371064,
|
|
"num_tokens": 393262586.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 1.3502801694683613,
|
|
"grad_norm": 0.12684213008995848,
|
|
"learning_rate": 1.6750035226151898e-06,
|
|
"loss": 1.1826,
|
|
"mean_token_accuracy": 0.7152134656906128,
|
|
"num_tokens": 394137775.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.3516468498018313,
|
|
"grad_norm": 0.12387326620304413,
|
|
"learning_rate": 1.671480907425673e-06,
|
|
"loss": 1.1662,
|
|
"mean_token_accuracy": 0.7164386332035064,
|
|
"num_tokens": 395076218.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.3530135301353012,
|
|
"grad_norm": 0.11847690497599249,
|
|
"learning_rate": 1.6679582922361562e-06,
|
|
"loss": 1.1839,
|
|
"mean_token_accuracy": 0.7160084486007691,
|
|
"num_tokens": 395972797.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.3543802104687712,
|
|
"grad_norm": 0.13210307930268192,
|
|
"learning_rate": 1.6644356770466396e-06,
|
|
"loss": 1.184,
|
|
"mean_token_accuracy": 0.7140582203865051,
|
|
"num_tokens": 396908337.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.3557468908022414,
|
|
"grad_norm": 0.13853201116572614,
|
|
"learning_rate": 1.660913061857123e-06,
|
|
"loss": 1.1911,
|
|
"mean_token_accuracy": 0.7132121801376343,
|
|
"num_tokens": 397843452.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.3571135711357114,
|
|
"grad_norm": 0.12293515890164981,
|
|
"learning_rate": 1.657390446667606e-06,
|
|
"loss": 1.1341,
|
|
"mean_token_accuracy": 0.7255500733852387,
|
|
"num_tokens": 398764890.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.3584802514691814,
|
|
"grad_norm": 0.11823294851071559,
|
|
"learning_rate": 1.6538678314780896e-06,
|
|
"loss": 1.1851,
|
|
"mean_token_accuracy": 0.714233136177063,
|
|
"num_tokens": 399672201.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 1.23984375,
|
|
"epoch": 1.3598469318026514,
|
|
"grad_norm": 0.13294108433559737,
|
|
"learning_rate": 1.6503452162885727e-06,
|
|
"loss": 1.25,
|
|
"mean_token_accuracy": 0.7001692950725555,
|
|
"num_tokens": 400567434.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.3612136121361214,
|
|
"grad_norm": 0.11436034607074078,
|
|
"learning_rate": 1.646822601099056e-06,
|
|
"loss": 1.185,
|
|
"mean_token_accuracy": 0.7145607173442841,
|
|
"num_tokens": 401483958.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.3625802924695913,
|
|
"grad_norm": 0.12179010385166884,
|
|
"learning_rate": 1.6432999859095394e-06,
|
|
"loss": 1.1886,
|
|
"mean_token_accuracy": 0.7152128875255584,
|
|
"num_tokens": 402403665.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 1.23828125,
|
|
"epoch": 1.3639469728030613,
|
|
"grad_norm": 0.14316631309861824,
|
|
"learning_rate": 1.6397773707200227e-06,
|
|
"loss": 1.2403,
|
|
"mean_token_accuracy": 0.7023570358753204,
|
|
"num_tokens": 403335419.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.3653136531365313,
|
|
"grad_norm": 0.11491903438931578,
|
|
"learning_rate": 1.636254755530506e-06,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.7143580317497253,
|
|
"num_tokens": 404219526.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 1.1140625,
|
|
"epoch": 1.3666803334700013,
|
|
"grad_norm": 0.11711989559703669,
|
|
"learning_rate": 1.6327321403409894e-06,
|
|
"loss": 1.1224,
|
|
"mean_token_accuracy": 0.7256307959556579,
|
|
"num_tokens": 405131412.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.3680470138034715,
|
|
"grad_norm": 0.1429840821174436,
|
|
"learning_rate": 1.6292095251514725e-06,
|
|
"loss": 1.2082,
|
|
"mean_token_accuracy": 0.7097049057483673,
|
|
"num_tokens": 406040447.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.3694136941369415,
|
|
"grad_norm": 0.15947812704082942,
|
|
"learning_rate": 1.625686909961956e-06,
|
|
"loss": 1.2161,
|
|
"mean_token_accuracy": 0.7076008975505829,
|
|
"num_tokens": 406981543.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.3707803744704115,
|
|
"grad_norm": 0.16626265363789886,
|
|
"learning_rate": 1.6221642947724392e-06,
|
|
"loss": 1.2003,
|
|
"mean_token_accuracy": 0.7126982271671295,
|
|
"num_tokens": 407857828.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 1.2390625,
|
|
"epoch": 1.3721470548038814,
|
|
"grad_norm": 0.12494512856614057,
|
|
"learning_rate": 1.6186416795829223e-06,
|
|
"loss": 1.2303,
|
|
"mean_token_accuracy": 0.7059546649456024,
|
|
"num_tokens": 408802466.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 1.21953125,
|
|
"epoch": 1.3735137351373514,
|
|
"grad_norm": 0.1341783677476121,
|
|
"learning_rate": 1.6151190643934058e-06,
|
|
"loss": 1.2274,
|
|
"mean_token_accuracy": 0.7048985183238983,
|
|
"num_tokens": 409739616.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.3748804154708214,
|
|
"grad_norm": 0.11806113634278109,
|
|
"learning_rate": 1.6115964492038892e-06,
|
|
"loss": 1.1711,
|
|
"mean_token_accuracy": 0.7176142632961273,
|
|
"num_tokens": 410650551.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.3762470958042914,
|
|
"grad_norm": 0.11227181532303977,
|
|
"learning_rate": 1.6080738340143723e-06,
|
|
"loss": 1.1841,
|
|
"mean_token_accuracy": 0.7156656563282013,
|
|
"num_tokens": 411591278.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.3776137761377614,
|
|
"grad_norm": 0.15304186595476815,
|
|
"learning_rate": 1.6045512188248559e-06,
|
|
"loss": 1.138,
|
|
"mean_token_accuracy": 0.723748505115509,
|
|
"num_tokens": 412462066.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 1.189453125,
|
|
"epoch": 1.3789804564712314,
|
|
"grad_norm": 0.119742351232881,
|
|
"learning_rate": 1.601028603635339e-06,
|
|
"loss": 1.1922,
|
|
"mean_token_accuracy": 0.7145638465881348,
|
|
"num_tokens": 413356863.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 1.3803471368047013,
|
|
"grad_norm": 0.13539001408924614,
|
|
"learning_rate": 1.5975059884458225e-06,
|
|
"loss": 1.2141,
|
|
"mean_token_accuracy": 0.7081993460655213,
|
|
"num_tokens": 414330061.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.3817138171381713,
|
|
"grad_norm": 0.12302108656413474,
|
|
"learning_rate": 1.5939833732563056e-06,
|
|
"loss": 1.1906,
|
|
"mean_token_accuracy": 0.7140569567680359,
|
|
"num_tokens": 415253697.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.3830804974716413,
|
|
"grad_norm": 0.11080258467539036,
|
|
"learning_rate": 1.5904607580667888e-06,
|
|
"loss": 1.19,
|
|
"mean_token_accuracy": 0.7118586421012878,
|
|
"num_tokens": 416203206.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 1.3844471778051113,
|
|
"grad_norm": 0.1553342878583109,
|
|
"learning_rate": 1.5869381428772723e-06,
|
|
"loss": 1.1985,
|
|
"mean_token_accuracy": 0.7122500121593476,
|
|
"num_tokens": 417115492.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 1.3858138581385813,
|
|
"grad_norm": 0.12699627762028695,
|
|
"learning_rate": 1.5834155276877554e-06,
|
|
"loss": 1.2112,
|
|
"mean_token_accuracy": 0.7102247536182403,
|
|
"num_tokens": 418021895.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.3871805384720512,
|
|
"grad_norm": 0.1448074188564779,
|
|
"learning_rate": 1.5798929124982388e-06,
|
|
"loss": 1.1877,
|
|
"mean_token_accuracy": 0.7118501245975495,
|
|
"num_tokens": 418902258.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.3885472188055215,
|
|
"grad_norm": 0.11968080489190475,
|
|
"learning_rate": 1.5763702973087221e-06,
|
|
"loss": 1.1985,
|
|
"mean_token_accuracy": 0.7109178841114044,
|
|
"num_tokens": 419795987.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 1.3899138991389914,
|
|
"grad_norm": 0.13349739640889682,
|
|
"learning_rate": 1.5728476821192054e-06,
|
|
"loss": 1.1497,
|
|
"mean_token_accuracy": 0.720890897512436,
|
|
"num_tokens": 420706336.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 1.24296875,
|
|
"epoch": 1.3912805794724614,
|
|
"grad_norm": 0.12219273854618498,
|
|
"learning_rate": 1.5693250669296886e-06,
|
|
"loss": 1.2515,
|
|
"mean_token_accuracy": 0.7023664534091949,
|
|
"num_tokens": 421662147.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.3926472598059314,
|
|
"grad_norm": 0.12261669733848543,
|
|
"learning_rate": 1.5658024517401721e-06,
|
|
"loss": 1.1744,
|
|
"mean_token_accuracy": 0.7164570271968842,
|
|
"num_tokens": 422577714.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.3940139401394014,
|
|
"grad_norm": 0.1257899285535477,
|
|
"learning_rate": 1.5622798365506552e-06,
|
|
"loss": 1.167,
|
|
"mean_token_accuracy": 0.7190118074417114,
|
|
"num_tokens": 423518824.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.3953806204728714,
|
|
"grad_norm": 0.12162041109803752,
|
|
"learning_rate": 1.5587572213611388e-06,
|
|
"loss": 1.1936,
|
|
"mean_token_accuracy": 0.7141225636005402,
|
|
"num_tokens": 424435989.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.3967473008063414,
|
|
"grad_norm": 0.11981948828415302,
|
|
"learning_rate": 1.555234606171622e-06,
|
|
"loss": 1.1907,
|
|
"mean_token_accuracy": 0.7141438543796539,
|
|
"num_tokens": 425383710.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 1.3981139811398113,
|
|
"grad_norm": 0.1230061974939298,
|
|
"learning_rate": 1.5517119909821052e-06,
|
|
"loss": 1.2067,
|
|
"mean_token_accuracy": 0.7105968654155731,
|
|
"num_tokens": 426338982.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.3994806614732813,
|
|
"grad_norm": 0.11469013510908216,
|
|
"learning_rate": 1.5481893757925886e-06,
|
|
"loss": 1.1927,
|
|
"mean_token_accuracy": 0.7129550576210022,
|
|
"num_tokens": 427285007.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 1.21328125,
|
|
"epoch": 1.4008473418067515,
|
|
"grad_norm": 0.13290562489723648,
|
|
"learning_rate": 1.544666760603072e-06,
|
|
"loss": 1.2186,
|
|
"mean_token_accuracy": 0.7071216821670532,
|
|
"num_tokens": 428164675.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 1.225,
|
|
"epoch": 1.4022140221402215,
|
|
"grad_norm": 0.12208787980032111,
|
|
"learning_rate": 1.541144145413555e-06,
|
|
"loss": 1.2404,
|
|
"mean_token_accuracy": 0.7027550578117371,
|
|
"num_tokens": 429111876.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.4035807024736915,
|
|
"grad_norm": 0.12028751164788541,
|
|
"learning_rate": 1.5376215302240386e-06,
|
|
"loss": 1.2158,
|
|
"mean_token_accuracy": 0.7104494333267212,
|
|
"num_tokens": 430098600.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 1.4049473828071615,
|
|
"grad_norm": 0.12382119909181938,
|
|
"learning_rate": 1.5340989150345217e-06,
|
|
"loss": 1.2066,
|
|
"mean_token_accuracy": 0.712181031703949,
|
|
"num_tokens": 431031276.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.4063140631406315,
|
|
"grad_norm": 0.117265924051645,
|
|
"learning_rate": 1.5305762998450048e-06,
|
|
"loss": 1.1556,
|
|
"mean_token_accuracy": 0.7216331601142884,
|
|
"num_tokens": 431937681.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 1.2546875,
|
|
"epoch": 1.4076807434741014,
|
|
"grad_norm": 0.15157856486995377,
|
|
"learning_rate": 1.5270536846554884e-06,
|
|
"loss": 1.2594,
|
|
"mean_token_accuracy": 0.7011895596981048,
|
|
"num_tokens": 432856353.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.4090474238075714,
|
|
"grad_norm": 0.13239243372645754,
|
|
"learning_rate": 1.5235310694659717e-06,
|
|
"loss": 1.2097,
|
|
"mean_token_accuracy": 0.7082525253295898,
|
|
"num_tokens": 433796865.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.4104141041410414,
|
|
"grad_norm": 0.14142989212330603,
|
|
"learning_rate": 1.5200084542764548e-06,
|
|
"loss": 1.1455,
|
|
"mean_token_accuracy": 0.7209932982921601,
|
|
"num_tokens": 434736283.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.4117807844745114,
|
|
"grad_norm": 0.12242545170011394,
|
|
"learning_rate": 1.5164858390869384e-06,
|
|
"loss": 1.1887,
|
|
"mean_token_accuracy": 0.7116934299468994,
|
|
"num_tokens": 435655129.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 1.2515625,
|
|
"epoch": 1.4131474648079814,
|
|
"grad_norm": 0.18227182327113786,
|
|
"learning_rate": 1.5129632238974215e-06,
|
|
"loss": 1.256,
|
|
"mean_token_accuracy": 0.7038849532604218,
|
|
"num_tokens": 436543272.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.4145141451414514,
|
|
"grad_norm": 0.12279610562858696,
|
|
"learning_rate": 1.509440608707905e-06,
|
|
"loss": 1.1832,
|
|
"mean_token_accuracy": 0.714367824792862,
|
|
"num_tokens": 437493506.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 1.4158808254749213,
|
|
"grad_norm": 0.12892224143703637,
|
|
"learning_rate": 1.5059179935183882e-06,
|
|
"loss": 1.2214,
|
|
"mean_token_accuracy": 0.7065628349781037,
|
|
"num_tokens": 438408794.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.4172475058083913,
|
|
"grad_norm": 0.12260536083914542,
|
|
"learning_rate": 1.5023953783288713e-06,
|
|
"loss": 1.1976,
|
|
"mean_token_accuracy": 0.7145713806152344,
|
|
"num_tokens": 439340151.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.4186141861418613,
|
|
"grad_norm": 0.11347768184689792,
|
|
"learning_rate": 1.4988727631393549e-06,
|
|
"loss": 1.1913,
|
|
"mean_token_accuracy": 0.7118435978889466,
|
|
"num_tokens": 440251200.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.4199808664753313,
|
|
"grad_norm": 0.13072437077965618,
|
|
"learning_rate": 1.495350147949838e-06,
|
|
"loss": 1.2275,
|
|
"mean_token_accuracy": 0.7088172912597657,
|
|
"num_tokens": 441208474.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.4213475468088015,
|
|
"grad_norm": 0.12107719033928728,
|
|
"learning_rate": 1.4918275327603213e-06,
|
|
"loss": 1.1655,
|
|
"mean_token_accuracy": 0.7205856680870056,
|
|
"num_tokens": 442086160.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.4227142271422715,
|
|
"grad_norm": 0.11106655379021554,
|
|
"learning_rate": 1.4883049175708047e-06,
|
|
"loss": 1.1755,
|
|
"mean_token_accuracy": 0.7172904133796691,
|
|
"num_tokens": 442956910.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 1.202734375,
|
|
"epoch": 1.4240809074757415,
|
|
"grad_norm": 0.11443046074388308,
|
|
"learning_rate": 1.484782302381288e-06,
|
|
"loss": 1.1882,
|
|
"mean_token_accuracy": 0.7125543653964996,
|
|
"num_tokens": 443882723.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.4254475878092114,
|
|
"grad_norm": 0.12631316624957978,
|
|
"learning_rate": 1.4812596871917711e-06,
|
|
"loss": 1.2051,
|
|
"mean_token_accuracy": 0.7116193890571594,
|
|
"num_tokens": 444783212.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.4268142681426814,
|
|
"grad_norm": 0.12769435184706665,
|
|
"learning_rate": 1.4777370720022547e-06,
|
|
"loss": 1.1601,
|
|
"mean_token_accuracy": 0.7182230830192566,
|
|
"num_tokens": 445698999.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.4281809484761514,
|
|
"grad_norm": 0.11806999651085748,
|
|
"learning_rate": 1.4742144568127378e-06,
|
|
"loss": 1.1784,
|
|
"mean_token_accuracy": 0.7149154603481293,
|
|
"num_tokens": 446652381.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.4295476288096214,
|
|
"grad_norm": 0.12028326634722751,
|
|
"learning_rate": 1.4706918416232213e-06,
|
|
"loss": 1.1353,
|
|
"mean_token_accuracy": 0.7242217242717743,
|
|
"num_tokens": 447562335.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 1.4309143091430914,
|
|
"grad_norm": 0.1243128300393688,
|
|
"learning_rate": 1.4671692264337045e-06,
|
|
"loss": 1.213,
|
|
"mean_token_accuracy": 0.7087359666824341,
|
|
"num_tokens": 448516535.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.4322809894765614,
|
|
"grad_norm": 0.12724366999240672,
|
|
"learning_rate": 1.4636466112441878e-06,
|
|
"loss": 1.1804,
|
|
"mean_token_accuracy": 0.7127282917499542,
|
|
"num_tokens": 449467717.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.4336476698100316,
|
|
"grad_norm": 0.1835914511072079,
|
|
"learning_rate": 1.4601239960546711e-06,
|
|
"loss": 1.1601,
|
|
"mean_token_accuracy": 0.7182330369949341,
|
|
"num_tokens": 450386399.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 1.4350143501435015,
|
|
"grad_norm": 0.12310172550480912,
|
|
"learning_rate": 1.4566013808651545e-06,
|
|
"loss": 1.2276,
|
|
"mean_token_accuracy": 0.7054740130901337,
|
|
"num_tokens": 451354547.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.4363810304769715,
|
|
"grad_norm": 0.1273286494095455,
|
|
"learning_rate": 1.4530787656756376e-06,
|
|
"loss": 1.157,
|
|
"mean_token_accuracy": 0.7205774664878846,
|
|
"num_tokens": 452306886.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.4377477108104415,
|
|
"grad_norm": 0.1283587110852586,
|
|
"learning_rate": 1.4495561504861211e-06,
|
|
"loss": 1.1812,
|
|
"mean_token_accuracy": 0.714612478017807,
|
|
"num_tokens": 453277261.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.4391143911439115,
|
|
"grad_norm": 0.10847778547042615,
|
|
"learning_rate": 1.4460335352966043e-06,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7181640505790711,
|
|
"num_tokens": 454188029.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 1.25390625,
|
|
"epoch": 1.4404810714773815,
|
|
"grad_norm": 0.119495829349321,
|
|
"learning_rate": 1.4425109201070876e-06,
|
|
"loss": 1.2539,
|
|
"mean_token_accuracy": 0.7000500679016113,
|
|
"num_tokens": 455193925.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.4418477518108515,
|
|
"grad_norm": 0.12315610504930043,
|
|
"learning_rate": 1.438988304917571e-06,
|
|
"loss": 1.1913,
|
|
"mean_token_accuracy": 0.7114707946777343,
|
|
"num_tokens": 456135320.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.4432144321443214,
|
|
"grad_norm": 0.11431507458093951,
|
|
"learning_rate": 1.4354656897280543e-06,
|
|
"loss": 1.1855,
|
|
"mean_token_accuracy": 0.7133112609386444,
|
|
"num_tokens": 457069802.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 1.167578125,
|
|
"epoch": 1.4445811124777914,
|
|
"grad_norm": 0.11931202713971453,
|
|
"learning_rate": 1.4319430745385376e-06,
|
|
"loss": 1.1688,
|
|
"mean_token_accuracy": 0.7160093009471893,
"num_tokens": 457985176.0,
"step": 10570
},
{
"entropy": 1.1828125,
"epoch": 1.4459477928112614,
"grad_norm": 0.13435186009314423,
"learning_rate": 1.428420459349021e-06,
"loss": 1.1826,
"mean_token_accuracy": 0.7145166099071503,
"num_tokens": 458901754.0,
"step": 10580
},
{
"entropy": 1.13359375,
"epoch": 1.4473144731447314,
"grad_norm": 0.12936609515975853,
"learning_rate": 1.424897844159504e-06,
"loss": 1.1462,
"mean_token_accuracy": 0.7219720363616944,
"num_tokens": 459822265.0,
"step": 10590
},
{
"entropy": 1.16015625,
"epoch": 1.4486811534782014,
"grad_norm": 0.12983540826208584,
"learning_rate": 1.4213752289699876e-06,
"loss": 1.1627,
"mean_token_accuracy": 0.7173291385173798,
"num_tokens": 460728165.0,
"step": 10600
},
{
"entropy": 1.1375,
"epoch": 1.4500478338116713,
"grad_norm": 0.11383999821124423,
"learning_rate": 1.4178526137804707e-06,
"loss": 1.1406,
"mean_token_accuracy": 0.7219341397285461,
"num_tokens": 461626029.0,
"step": 10610
},
{
"entropy": 1.14921875,
"epoch": 1.4514145141451413,
"grad_norm": 0.3518384291971492,
"learning_rate": 1.4143299985909538e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.723987627029419,
"num_tokens": 462517789.0,
"step": 10620
},
{
"entropy": 1.15703125,
"epoch": 1.4527811944786113,
"grad_norm": 0.12193196594782105,
"learning_rate": 1.4108073834014374e-06,
"loss": 1.1461,
"mean_token_accuracy": 0.7209883809089661,
"num_tokens": 463466651.0,
"step": 10630
},
{
"entropy": 1.1578125,
"epoch": 1.4541478748120815,
"grad_norm": 0.12605874099132533,
"learning_rate": 1.4072847682119205e-06,
"loss": 1.1663,
"mean_token_accuracy": 0.7185406863689423,
"num_tokens": 464412649.0,
"step": 10640
},
{
"entropy": 1.19609375,
"epoch": 1.4555145551455515,
"grad_norm": 0.11572325756943666,
"learning_rate": 1.4037621530224039e-06,
"loss": 1.1954,
"mean_token_accuracy": 0.7135636150836945,
"num_tokens": 465389049.0,
"step": 10650
},
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.4568812354790215,
|
|
"grad_norm": 0.11851702604197216,
|
|
"learning_rate": 1.4002395378328872e-06,
|
|
"loss": 1.1633,
|
|
"mean_token_accuracy": 0.7157575011253356,
|
|
"num_tokens": 466347450.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.4582479158124915,
|
|
"grad_norm": 0.1341561090177334,
|
|
"learning_rate": 1.3967169226433705e-06,
|
|
"loss": 1.1409,
|
|
"mean_token_accuracy": 0.7232238173484802,
|
|
"num_tokens": 467266497.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 1.4596145961459615,
|
|
"grad_norm": 0.12609430908413768,
|
|
"learning_rate": 1.393194307453854e-06,
|
|
"loss": 1.1966,
|
|
"mean_token_accuracy": 0.7117892980575562,
|
|
"num_tokens": 468203410.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 1.4609812764794314,
|
|
"grad_norm": 0.12249845786679855,
|
|
"learning_rate": 1.3896716922643372e-06,
|
|
"loss": 1.2166,
|
|
"mean_token_accuracy": 0.7073101282119751,
|
|
"num_tokens": 469106694.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.4623479568129014,
|
|
"grad_norm": 0.1161829048567437,
|
|
"learning_rate": 1.3861490770748203e-06,
|
|
"loss": 1.1719,
|
|
"mean_token_accuracy": 0.7174733579158783,
|
|
"num_tokens": 470043200.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.4637146371463714,
|
|
"grad_norm": 0.13740128930989098,
|
|
"learning_rate": 1.3826264618853039e-06,
|
|
"loss": 1.1992,
|
|
"mean_token_accuracy": 0.7105812191963196,
|
|
"num_tokens": 470915220.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 1.4650813174798414,
|
|
"grad_norm": 0.15213490676141175,
|
|
"learning_rate": 1.379103846695787e-06,
|
|
"loss": 1.2169,
|
|
"mean_token_accuracy": 0.7084613800048828,
|
|
"num_tokens": 471906739.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 1.4664479978133116,
|
|
"grad_norm": 0.128817494892557,
|
|
"learning_rate": 1.3755812315062703e-06,
|
|
"loss": 1.2126,
|
|
"mean_token_accuracy": 0.7090679824352264,
|
|
"num_tokens": 472819447.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.4678146781467816,
|
|
"grad_norm": 0.11152127185500739,
|
|
"learning_rate": 1.3720586163167537e-06,
|
|
"loss": 1.1782,
|
|
"mean_token_accuracy": 0.7148505210876465,
|
|
"num_tokens": 473735873.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.4691813584802516,
|
|
"grad_norm": 0.1756048847425214,
|
|
"learning_rate": 1.368536001127237e-06,
|
|
"loss": 1.2108,
|
|
"mean_token_accuracy": 0.7103855311870575,
|
|
"num_tokens": 474628917.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.4705480388137215,
|
|
"grad_norm": 0.11419149374421857,
|
|
"learning_rate": 1.3650133859377201e-06,
|
|
"loss": 1.1517,
|
|
"mean_token_accuracy": 0.7178873836994171,
|
|
"num_tokens": 475543698.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.4719147191471915,
|
|
"grad_norm": 0.1226789883052144,
|
|
"learning_rate": 1.3614907707482037e-06,
|
|
"loss": 1.1871,
|
|
"mean_token_accuracy": 0.7134617805480957,
|
|
"num_tokens": 476440932.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 1.26484375,
|
|
"epoch": 1.4732813994806615,
|
|
"grad_norm": 0.1197066051864847,
|
|
"learning_rate": 1.3579681555586868e-06,
|
|
"loss": 1.2587,
|
|
"mean_token_accuracy": 0.7027205884456634,
|
|
"num_tokens": 477391065.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 1.19375,
|
|
"epoch": 1.4746480798141315,
|
|
"grad_norm": 0.11283186868863818,
|
|
"learning_rate": 1.3544455403691703e-06,
|
|
"loss": 1.2007,
|
|
"mean_token_accuracy": 0.7115746319293976,
|
|
"num_tokens": 478340658.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 1.16171875,
|
|
"epoch": 1.4760147601476015,
|
|
"grad_norm": 0.12901522235161597,
|
|
"learning_rate": 1.3509229251796535e-06,
|
|
"loss": 1.1533,
|
|
"mean_token_accuracy": 0.721243005990982,
|
|
"num_tokens": 479223851.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.4773814404810715,
|
|
"grad_norm": 0.10832956377547402,
|
|
"learning_rate": 1.3474003099901368e-06,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.7130549490451813,
|
|
"num_tokens": 480122931.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.4787481208145414,
|
|
"grad_norm": 0.11489808892520535,
|
|
"learning_rate": 1.3438776948006201e-06,
|
|
"loss": 1.1622,
|
|
"mean_token_accuracy": 0.7195812046527863,
|
|
"num_tokens": 481029820.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.4801148011480114,
|
|
"grad_norm": 0.11845772545591786,
|
|
"learning_rate": 1.3403550796111035e-06,
|
|
"loss": 1.1586,
|
|
"mean_token_accuracy": 0.7210663318634033,
|
|
"num_tokens": 481955081.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.4814814814814814,
|
|
"grad_norm": 0.11974194312337996,
|
|
"learning_rate": 1.3368324644215866e-06,
|
|
"loss": 1.1869,
|
|
"mean_token_accuracy": 0.7133110105991364,
|
|
"num_tokens": 482876786.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.4828481618149514,
|
|
"grad_norm": 0.11819263387768167,
|
|
"learning_rate": 1.3333098492320701e-06,
|
|
"loss": 1.1448,
|
|
"mean_token_accuracy": 0.7218520641326904,
|
|
"num_tokens": 483812583.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 1.4842148421484214,
|
|
"grad_norm": 0.12151576397115825,
|
|
"learning_rate": 1.3297872340425533e-06,
|
|
"loss": 1.2043,
|
|
"mean_token_accuracy": 0.7095657587051392,
|
|
"num_tokens": 484710407.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 1.4855815224818913,
|
|
"grad_norm": 0.12440992663647145,
|
|
"learning_rate": 1.3262646188530364e-06,
|
|
"loss": 1.2439,
|
|
"mean_token_accuracy": 0.7046646356582642,
|
|
"num_tokens": 485677090.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.4869482028153616,
|
|
"grad_norm": 0.13299395633006436,
|
|
"learning_rate": 1.32274200366352e-06,
|
|
"loss": 1.1621,
|
|
"mean_token_accuracy": 0.720768004655838,
|
|
"num_tokens": 486599145.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.4883148831488315,
|
|
"grad_norm": 0.1325360524736375,
|
|
"learning_rate": 1.319219388474003e-06,
|
|
"loss": 1.1533,
|
|
"mean_token_accuracy": 0.7215927183628082,
|
|
"num_tokens": 487479628.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 1.4896815634823015,
|
|
"grad_norm": 0.11517251803870836,
|
|
"learning_rate": 1.3156967732844866e-06,
|
|
"loss": 1.1996,
|
|
"mean_token_accuracy": 0.7126593589782715,
|
|
"num_tokens": 488413015.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.4910482438157715,
|
|
"grad_norm": 0.13275208171622108,
|
|
"learning_rate": 1.31217415809497e-06,
|
|
"loss": 1.1482,
|
|
"mean_token_accuracy": 0.7213903188705444,
|
|
"num_tokens": 489355230.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 1.216015625,
|
|
"epoch": 1.4924149241492415,
|
|
"grad_norm": 0.12581110237983914,
|
|
"learning_rate": 1.308651542905453e-06,
|
|
"loss": 1.2331,
|
|
"mean_token_accuracy": 0.702860289812088,
|
|
"num_tokens": 490334779.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.4937816044827115,
|
|
"grad_norm": 0.12177600073831027,
|
|
"learning_rate": 1.3051289277159366e-06,
|
|
"loss": 1.1622,
|
|
"mean_token_accuracy": 0.7174063324928284,
|
|
"num_tokens": 491236942.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.4951482848161814,
|
|
"grad_norm": 0.12641061130104858,
|
|
"learning_rate": 1.3016063125264197e-06,
|
|
"loss": 1.1707,
|
|
"mean_token_accuracy": 0.7183970928192138,
|
|
"num_tokens": 492130177.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.4965149651496514,
|
|
"grad_norm": 0.1245941601679515,
|
|
"learning_rate": 1.2980836973369029e-06,
|
|
"loss": 1.1966,
|
|
"mean_token_accuracy": 0.7093767106533051,
|
|
"num_tokens": 493024953.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.4978816454831214,
|
|
"grad_norm": 0.12594653367165037,
|
|
"learning_rate": 1.2945610821473864e-06,
|
|
"loss": 1.1573,
|
|
"mean_token_accuracy": 0.7201925456523895,
|
|
"num_tokens": 493907081.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.4992483258165916,
|
|
"grad_norm": 0.13576429268627935,
|
|
"learning_rate": 1.2910384669578695e-06,
|
|
"loss": 1.1466,
|
|
"mean_token_accuracy": 0.7243741691112519,
|
|
"num_tokens": 494835150.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.5006150061500616,
|
|
"grad_norm": 0.13423582514218366,
|
|
"learning_rate": 1.2875158517683529e-06,
|
|
"loss": 1.165,
|
|
"mean_token_accuracy": 0.7178705155849456,
|
|
"num_tokens": 495729808.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 1.20625,
|
|
"epoch": 1.5019816864835316,
|
|
"grad_norm": 0.1163662013547518,
|
|
"learning_rate": 1.2839932365788362e-06,
|
|
"loss": 1.213,
|
|
"mean_token_accuracy": 0.7084574997425079,
|
|
"num_tokens": 496647261.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.5033483668170016,
|
|
"grad_norm": 0.106535076175349,
|
|
"learning_rate": 1.2804706213893195e-06,
|
|
"loss": 1.1901,
|
|
"mean_token_accuracy": 0.7130894482135772,
|
|
"num_tokens": 497587325.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.5047150471504716,
|
|
"grad_norm": 0.12459080069643369,
|
|
"learning_rate": 1.2769480061998029e-06,
|
|
"loss": 1.2233,
|
|
"mean_token_accuracy": 0.7085772037506104,
|
|
"num_tokens": 498514107.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.5060817274839415,
|
|
"grad_norm": 0.11252065185068638,
|
|
"learning_rate": 1.2734253910102862e-06,
|
|
"loss": 1.1634,
|
|
"mean_token_accuracy": 0.7188735485076905,
|
|
"num_tokens": 499473541.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.5074484078174115,
|
|
"grad_norm": 0.12066298719314511,
|
|
"learning_rate": 1.2699027758207693e-06,
|
|
"loss": 1.1832,
|
|
"mean_token_accuracy": 0.7142462730407715,
|
|
"num_tokens": 500390760.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 1.5088150881508815,
|
|
"grad_norm": 0.12521274859977202,
|
|
"learning_rate": 1.2663801606312529e-06,
|
|
"loss": 1.2373,
|
|
"mean_token_accuracy": 0.7032889425754547,
|
|
"num_tokens": 501384396.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 1.5101817684843515,
|
|
"grad_norm": 0.1287628931679287,
|
|
"learning_rate": 1.262857545441736e-06,
|
|
"loss": 1.1281,
|
|
"mean_token_accuracy": 0.7247558534145355,
|
|
"num_tokens": 502260086.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.5115484488178215,
|
|
"grad_norm": 0.1751744776783351,
|
|
"learning_rate": 1.2593349302522193e-06,
|
|
"loss": 1.1939,
|
|
"mean_token_accuracy": 0.7120347380638122,
|
|
"num_tokens": 503173427.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 1.5129151291512914,
|
|
"grad_norm": 0.12429304336821885,
|
|
"learning_rate": 1.2558123150627027e-06,
|
|
"loss": 1.2303,
|
|
"mean_token_accuracy": 0.7058331310749054,
|
|
"num_tokens": 504111494.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 1.215625,
|
|
"epoch": 1.5142818094847614,
|
|
"grad_norm": 0.12766192953607775,
|
|
"learning_rate": 1.252289699873186e-06,
|
|
"loss": 1.2295,
|
|
"mean_token_accuracy": 0.703783905506134,
|
|
"num_tokens": 505041071.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 1.5156484898182314,
|
|
"grad_norm": 0.17528407459361584,
|
|
"learning_rate": 1.2487670846836693e-06,
|
|
"loss": 1.2127,
|
|
"mean_token_accuracy": 0.7079283177852631,
|
|
"num_tokens": 505975462.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 1.27421875,
|
|
"epoch": 1.5170151701517014,
|
|
"grad_norm": 0.1462769106107419,
|
|
"learning_rate": 1.2452444694941527e-06,
|
|
"loss": 1.2854,
|
|
"mean_token_accuracy": 0.6950620114803314,
|
|
"num_tokens": 506866778.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.5183818504851714,
|
|
"grad_norm": 0.10583201991712385,
|
|
"learning_rate": 1.2417218543046358e-06,
|
|
"loss": 1.1949,
|
|
"mean_token_accuracy": 0.7143675148487091,
|
|
"num_tokens": 507789436.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.5197485308186414,
|
|
"grad_norm": 0.1652470786920535,
|
|
"learning_rate": 1.2381992391151191e-06,
|
|
"loss": 1.1693,
|
|
"mean_token_accuracy": 0.7182806015014649,
|
|
"num_tokens": 508699280.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.5211152111521116,
|
|
"grad_norm": 0.12210270971207664,
|
|
"learning_rate": 1.2346766239256025e-06,
|
|
"loss": 1.1921,
|
|
"mean_token_accuracy": 0.7116478860378266,
|
|
"num_tokens": 509649338.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.5224818914855816,
|
|
"grad_norm": 0.11969833622326821,
|
|
"learning_rate": 1.2311540087360858e-06,
|
|
"loss": 1.1744,
|
|
"mean_token_accuracy": 0.7164743363857269,
|
|
"num_tokens": 510612511.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.5238485718190515,
|
|
"grad_norm": 0.1381718752995938,
|
|
"learning_rate": 1.2276313935465691e-06,
|
|
"loss": 1.1424,
|
|
"mean_token_accuracy": 0.7236548244953156,
|
|
"num_tokens": 511529555.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 1.1453125,
|
|
"epoch": 1.5252152521525215,
|
|
"grad_norm": 0.1254291738470363,
|
|
"learning_rate": 1.2241087783570525e-06,
|
|
"loss": 1.1576,
|
|
"mean_token_accuracy": 0.71940758228302,
|
|
"num_tokens": 512457097.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.5265819324859915,
|
|
"grad_norm": 0.11804423970278698,
|
|
"learning_rate": 1.2205861631675358e-06,
|
|
"loss": 1.1848,
|
|
"mean_token_accuracy": 0.7130425214767456,
|
|
"num_tokens": 513390116.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 1.5279486128194615,
|
|
"grad_norm": 0.12450935550364968,
|
|
"learning_rate": 1.217063547978019e-06,
|
|
"loss": 1.2019,
|
|
"mean_token_accuracy": 0.7105193793773651,
|
|
"num_tokens": 514298566.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.5293152931529317,
|
|
"grad_norm": 0.12534423131836586,
|
|
"learning_rate": 1.2135409327885023e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.715944242477417,
|
|
"num_tokens": 515216398.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.5306819734864017,
|
|
"grad_norm": 0.10990886825444784,
|
|
"learning_rate": 1.2100183175989856e-06,
|
|
"loss": 1.1634,
|
|
"mean_token_accuracy": 0.7188715994358063,
|
|
"num_tokens": 516116406.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.5320486538198717,
|
|
"grad_norm": 0.12074809131969544,
|
|
"learning_rate": 1.206495702409469e-06,
|
|
"loss": 1.145,
|
|
"mean_token_accuracy": 0.721521133184433,
|
|
"num_tokens": 517015399.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 1.112890625,
|
|
"epoch": 1.5334153341533416,
|
|
"grad_norm": 0.1415684690952763,
|
|
"learning_rate": 1.202973087219952e-06,
|
|
"loss": 1.1114,
|
|
"mean_token_accuracy": 0.7241316735744476,
|
|
"num_tokens": 517938846.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.5347820144868116,
|
|
"grad_norm": 0.14857777437066927,
|
|
"learning_rate": 1.1994504720304354e-06,
|
|
"loss": 1.1692,
|
|
"mean_token_accuracy": 0.7171053528785706,
|
|
"num_tokens": 518889200.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.5361486948202816,
|
|
"grad_norm": 0.1371976430259961,
|
|
"learning_rate": 1.1959278568409187e-06,
|
|
"loss": 1.1683,
|
|
"mean_token_accuracy": 0.7164720058441162,
|
|
"num_tokens": 519840847.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.5375153751537516,
|
|
"grad_norm": 0.12341534513843541,
|
|
"learning_rate": 1.192405241651402e-06,
|
|
"loss": 1.1509,
|
|
"mean_token_accuracy": 0.7184807300567627,
|
|
"num_tokens": 520763327.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 1.169140625,
|
|
"epoch": 1.5388820554872216,
|
|
"grad_norm": 0.11843276947998599,
|
|
"learning_rate": 1.1888826264618854e-06,
|
|
"loss": 1.1758,
|
|
"mean_token_accuracy": 0.7159298002719879,
|
|
"num_tokens": 521674767.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.5402487358206916,
|
|
"grad_norm": 0.12967491362738226,
|
|
"learning_rate": 1.1853600112723687e-06,
|
|
"loss": 1.182,
|
|
"mean_token_accuracy": 0.7155235111713409,
|
|
"num_tokens": 522615184.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.5416154161541615,
|
|
"grad_norm": 0.13365349243475538,
|
|
"learning_rate": 1.181837396082852e-06,
|
|
"loss": 1.1564,
|
|
"mean_token_accuracy": 0.7195694983005524,
|
|
"num_tokens": 523552065.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.5429820964876315,
|
|
"grad_norm": 0.11904696327479557,
|
|
"learning_rate": 1.1783147808933352e-06,
|
|
"loss": 1.162,
|
|
"mean_token_accuracy": 0.718795895576477,
|
|
"num_tokens": 524480764.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 1.24453125,
|
|
"epoch": 1.5443487768211015,
|
|
"grad_norm": 0.12863574930858462,
|
|
"learning_rate": 1.1747921657038185e-06,
|
|
"loss": 1.2525,
|
|
"mean_token_accuracy": 0.700759333372116,
|
|
"num_tokens": 525402753.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.5457154571545715,
|
|
"grad_norm": 0.12277501657723819,
|
|
"learning_rate": 1.1712695505143019e-06,
|
|
"loss": 1.1387,
|
|
"mean_token_accuracy": 0.7214914560317993,
|
|
"num_tokens": 526299377.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 1.5470821374880415,
|
|
"grad_norm": 0.11847055615933152,
|
|
"learning_rate": 1.1677469353247852e-06,
|
|
"loss": 1.1959,
|
|
"mean_token_accuracy": 0.71388378739357,
|
|
"num_tokens": 527168284.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 1.5484488178215114,
|
|
"grad_norm": 0.11416579771568856,
|
|
"learning_rate": 1.1642243201352685e-06,
|
|
"loss": 1.209,
|
|
"mean_token_accuracy": 0.708658230304718,
|
|
"num_tokens": 528122686.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.5498154981549814,
|
|
"grad_norm": 0.13008757039748362,
|
|
"learning_rate": 1.1607017049457519e-06,
|
|
"loss": 1.1729,
|
|
"mean_token_accuracy": 0.7150499880313873,
|
|
"num_tokens": 529059416.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.5511821784884514,
|
|
"grad_norm": 0.12546122033372611,
|
|
"learning_rate": 1.1571790897562352e-06,
|
|
"loss": 1.1945,
|
|
"mean_token_accuracy": 0.7130601406097412,
|
|
"num_tokens": 529952654.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.5525488588219214,
|
|
"grad_norm": 0.14008889690642115,
|
|
"learning_rate": 1.1536564745667183e-06,
|
|
"loss": 1.1467,
|
|
"mean_token_accuracy": 0.7223858058452606,
|
|
"num_tokens": 530857931.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.5539155391553916,
|
|
"grad_norm": 0.1727895090508185,
|
|
"learning_rate": 1.1501338593772017e-06,
|
|
"loss": 1.1541,
|
|
"mean_token_accuracy": 0.7211673021316528,
|
|
"num_tokens": 531760313.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.5552822194888616,
|
|
"grad_norm": 0.12125180299532673,
|
|
"learning_rate": 1.146611244187685e-06,
|
|
"loss": 1.2151,
|
|
"mean_token_accuracy": 0.7117531657218933,
|
|
"num_tokens": 532709579.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.5566488998223316,
|
|
"grad_norm": 0.1200367238180539,
|
|
"learning_rate": 1.1430886289981683e-06,
|
|
"loss": 1.1902,
|
|
"mean_token_accuracy": 0.7105514407157898,
|
|
"num_tokens": 533627278.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 1.118359375,
|
|
"epoch": 1.5580155801558015,
|
|
"grad_norm": 0.1302240901039451,
|
|
"learning_rate": 1.1395660138086517e-06,
|
|
"loss": 1.1119,
|
|
"mean_token_accuracy": 0.727330493927002,
|
|
"num_tokens": 534490443.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.5593822604892715,
|
|
"grad_norm": 0.11328728167918889,
|
|
"learning_rate": 1.136043398619135e-06,
|
|
"loss": 1.1586,
|
|
"mean_token_accuracy": 0.7217346012592316,
|
|
"num_tokens": 535400143.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.5607489408227415,
|
|
"grad_norm": 0.1308578220458541,
|
|
"learning_rate": 1.1325207834296184e-06,
|
|
"loss": 1.214,
|
|
"mean_token_accuracy": 0.7085013926029206,
|
|
"num_tokens": 536342552.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.5621156211562117,
|
|
"grad_norm": 0.13699496327678348,
|
|
"learning_rate": 1.1289981682401017e-06,
|
|
"loss": 1.1736,
|
|
"mean_token_accuracy": 0.7163388550281524,
|
|
"num_tokens": 537229391.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 1.169921875,
|
|
"epoch": 1.5634823014896817,
|
|
"grad_norm": 0.11376287209059809,
|
|
"learning_rate": 1.1254755530505848e-06,
|
|
"loss": 1.1606,
|
|
"mean_token_accuracy": 0.7195038676261902,
|
|
"num_tokens": 538149167.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 1.20390625,
|
|
"epoch": 1.5648489818231517,
|
|
"grad_norm": 0.12294796774745699,
|
|
"learning_rate": 1.1219529378610681e-06,
|
|
"loss": 1.204,
|
|
"mean_token_accuracy": 0.708750170469284,
|
|
"num_tokens": 539069493.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.5662156621566217,
|
|
"grad_norm": 0.12409566167395614,
|
|
"learning_rate": 1.1184303226715515e-06,
|
|
"loss": 1.1958,
|
|
"mean_token_accuracy": 0.7133151292800903,
|
|
"num_tokens": 540011628.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.5675823424900917,
|
|
"grad_norm": 0.12418630547011432,
|
|
"learning_rate": 1.1149077074820346e-06,
|
|
"loss": 1.199,
|
|
"mean_token_accuracy": 0.7104664146900177,
|
|
"num_tokens": 540941914.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 1.5689490228235616,
|
|
"grad_norm": 0.12252248440062793,
|
|
"learning_rate": 1.111385092292518e-06,
|
|
"loss": 1.1999,
|
|
"mean_token_accuracy": 0.7132310688495636,
|
|
"num_tokens": 541852128.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.5703157031570316,
|
|
"grad_norm": 0.12295306723663087,
|
|
"learning_rate": 1.1078624771030013e-06,
|
|
"loss": 1.1805,
|
|
"mean_token_accuracy": 0.7159697353839874,
|
|
"num_tokens": 542772682.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.5716823834905016,
|
|
"grad_norm": 0.1720797716543464,
|
|
"learning_rate": 1.1043398619134846e-06,
|
|
"loss": 1.2006,
|
|
"mean_token_accuracy": 0.7098045527935029,
|
|
"num_tokens": 543733606.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 1.5730490638239716,
|
|
"grad_norm": 0.1136252754996673,
|
|
"learning_rate": 1.100817246723968e-06,
|
|
"loss": 1.2073,
|
|
"mean_token_accuracy": 0.7100187182426453,
|
|
"num_tokens": 544639316.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 1.20234375,
|
|
"epoch": 1.5744157441574416,
|
|
"grad_norm": 0.11673801186425999,
|
|
"learning_rate": 1.0972946315344513e-06,
|
|
"loss": 1.2186,
|
|
"mean_token_accuracy": 0.7099650502204895,
|
|
"num_tokens": 545599327.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.5757824244909115,
|
|
"grad_norm": 0.11996388538195268,
|
|
"learning_rate": 1.0937720163449346e-06,
|
|
"loss": 1.1668,
|
|
"mean_token_accuracy": 0.7191318333148956,
|
|
"num_tokens": 546504706.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.5771491048243815,
|
|
"grad_norm": 0.12245220435392648,
|
|
"learning_rate": 1.0902494011554177e-06,
|
|
"loss": 1.1536,
|
|
"mean_token_accuracy": 0.7184059321880341,
|
|
"num_tokens": 547439133.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.5785157851578515,
|
|
"grad_norm": 0.14884192612823277,
|
|
"learning_rate": 1.086726785965901e-06,
|
|
"loss": 1.1645,
|
|
"mean_token_accuracy": 0.718564248085022,
|
|
"num_tokens": 548402141.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 1.20625,
|
|
"epoch": 1.5798824654913215,
|
|
"grad_norm": 0.12535305203112507,
|
|
"learning_rate": 1.0832041707763844e-06,
|
|
"loss": 1.2111,
|
|
"mean_token_accuracy": 0.7106125593185425,
|
|
"num_tokens": 549328705.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.5812491458247915,
|
|
"grad_norm": 0.11881294449806738,
|
|
"learning_rate": 1.0796815555868678e-06,
|
|
"loss": 1.2104,
|
|
"mean_token_accuracy": 0.7120871126651764,
|
|
"num_tokens": 550311338.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 1.2234375,
|
|
"epoch": 1.5826158261582615,
|
|
"grad_norm": 0.11376733332811201,
|
|
"learning_rate": 1.076158940397351e-06,
|
|
"loss": 1.2354,
|
|
"mean_token_accuracy": 0.7043293535709381,
|
|
"num_tokens": 551238513.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.5839825064917314,
|
|
"grad_norm": 0.11087867568838482,
|
|
"learning_rate": 1.0726363252078344e-06,
|
|
"loss": 1.1678,
|
|
"mean_token_accuracy": 0.7180234313011169,
|
|
"num_tokens": 552127989.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.5853491868252014,
|
|
"grad_norm": 0.11724991805278914,
|
|
"learning_rate": 1.0691137100183178e-06,
|
|
"loss": 1.1453,
|
|
"mean_token_accuracy": 0.7231273293495178,
|
|
"num_tokens": 553036519.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.5867158671586716,
|
|
"grad_norm": 0.1277102542812985,
|
|
"learning_rate": 1.065591094828801e-06,
|
|
"loss": 1.1564,
|
|
"mean_token_accuracy": 0.7175122439861298,
|
|
"num_tokens": 554010433.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.5880825474921416,
|
|
"grad_norm": 0.13993611288832214,
|
|
"learning_rate": 1.0620684796392842e-06,
|
|
"loss": 1.1757,
|
|
"mean_token_accuracy": 0.7152313768863678,
|
|
"num_tokens": 554876950.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 1.128515625,
|
|
"epoch": 1.5894492278256116,
|
|
"grad_norm": 0.12649593093764266,
|
|
"learning_rate": 1.0585458644497676e-06,
|
|
"loss": 1.122,
|
|
"mean_token_accuracy": 0.7252063572406768,
|
|
"num_tokens": 555757487.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 1.20703125,
|
|
"epoch": 1.5908159081590816,
|
|
"grad_norm": 0.1576292802089691,
|
|
"learning_rate": 1.0550232492602509e-06,
|
|
"loss": 1.2126,
|
|
"mean_token_accuracy": 0.7078076779842377,
|
|
"num_tokens": 556662801.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.5921825884925516,
|
|
"grad_norm": 0.11904782187006918,
|
|
"learning_rate": 1.0515006340707342e-06,
|
|
"loss": 1.1862,
|
|
"mean_token_accuracy": 0.7146446526050567,
|
|
"num_tokens": 557566050.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 1.12890625,
|
|
"epoch": 1.5935492688260215,
|
|
"grad_norm": 0.13334386577658044,
|
|
"learning_rate": 1.0479780188812176e-06,
|
|
"loss": 1.1328,
|
|
"mean_token_accuracy": 0.7253441452980042,
|
|
"num_tokens": 558439548.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.5949159491594918,
|
|
"grad_norm": 0.1453931226126289,
|
|
"learning_rate": 1.0444554036917009e-06,
|
|
"loss": 1.2013,
|
|
"mean_token_accuracy": 0.7111631572246552,
|
|
"num_tokens": 559371823.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.5962826294929617,
|
|
"grad_norm": 0.11819074487873452,
|
|
"learning_rate": 1.0409327885021842e-06,
|
|
"loss": 1.2266,
|
|
"mean_token_accuracy": 0.703775840997696,
|
|
"num_tokens": 560310399.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.5976493098264317,
|
|
"grad_norm": 0.11588531776443836,
|
|
"learning_rate": 1.0374101733126674e-06,
|
|
"loss": 1.1614,
|
|
"mean_token_accuracy": 0.7190374433994293,
|
|
"num_tokens": 561230216.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.5990159901599017,
|
|
"grad_norm": 0.12842651382130144,
|
|
"learning_rate": 1.0338875581231507e-06,
|
|
"loss": 1.1293,
|
|
"mean_token_accuracy": 0.7241967439651489,
|
|
"num_tokens": 562091149.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 1.228125,
|
|
"epoch": 1.6003826704933717,
|
|
"grad_norm": 0.12115661226514919,
|
|
"learning_rate": 1.030364942933634e-06,
|
|
"loss": 1.2329,
|
|
"mean_token_accuracy": 0.7034363329410553,
|
|
"num_tokens": 562987687.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 1.18359375,
|
|
"epoch": 1.6017493508268417,
|
|
"grad_norm": 0.10615155363342729,
|
|
"learning_rate": 1.0268423277441174e-06,
|
|
"loss": 1.1875,
|
|
"mean_token_accuracy": 0.7133141696453095,
|
|
"num_tokens": 563875852.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 1.6031160311603116,
|
|
"grad_norm": 0.14262960818849826,
|
|
"learning_rate": 1.0233197125546005e-06,
|
|
"loss": 1.1558,
|
|
"mean_token_accuracy": 0.7213080763816834,
|
|
"num_tokens": 564742159.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.6044827114937816,
|
|
"grad_norm": 0.12471560143947721,
|
|
"learning_rate": 1.0197970973650838e-06,
|
|
"loss": 1.1935,
|
|
"mean_token_accuracy": 0.7119753062725067,
|
|
"num_tokens": 565667534.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.6058493918272516,
|
|
"grad_norm": 0.12584546833462207,
|
|
"learning_rate": 1.0162744821755674e-06,
|
|
"loss": 1.1994,
|
|
"mean_token_accuracy": 0.7124448418617249,
|
|
"num_tokens": 566601480.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.6072160721607216,
|
|
"grad_norm": 0.1229345574834371,
|
|
"learning_rate": 1.0127518669860505e-06,
|
|
"loss": 1.168,
|
|
"mean_token_accuracy": 0.7179634988307952,
|
|
"num_tokens": 567516706.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 1.21875,
|
|
"epoch": 1.6085827524941916,
|
|
"grad_norm": 0.13193728250906542,
|
|
"learning_rate": 1.0092292517965338e-06,
|
|
"loss": 1.2188,
|
|
"mean_token_accuracy": 0.709744930267334,
|
|
"num_tokens": 568465666.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.6099494328276616,
|
|
"grad_norm": 0.13060102801441084,
|
|
"learning_rate": 1.0057066366070172e-06,
|
|
"loss": 1.1971,
|
|
"mean_token_accuracy": 0.7124235153198242,
|
|
"num_tokens": 569379426.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.6113161131611315,
|
|
"grad_norm": 0.11312586140272272,
|
|
"learning_rate": 1.0021840214175005e-06,
|
|
"loss": 1.1306,
|
|
"mean_token_accuracy": 0.724940836429596,
|
|
"num_tokens": 570290881.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 1.1890625,
|
|
"epoch": 1.6126827934946015,
|
|
"grad_norm": 0.13237222354794195,
|
|
"learning_rate": 9.986614062279836e-07,
|
|
"loss": 1.1979,
|
|
"mean_token_accuracy": 0.7134689688682556,
|
|
"num_tokens": 571225452.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 1.13125,
|
|
"epoch": 1.6140494738280715,
|
|
"grad_norm": 0.1314788080511237,
|
|
"learning_rate": 9.95138791038467e-07,
|
|
"loss": 1.126,
|
|
"mean_token_accuracy": 0.7254875302314758,
|
|
"num_tokens": 572157913.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 1.153125,
|
|
"epoch": 1.6154161541615415,
|
|
"grad_norm": 0.12377737580062767,
|
|
"learning_rate": 9.916161758489503e-07,
|
|
"loss": 1.1466,
|
|
"mean_token_accuracy": 0.7230316638946533,
|
|
"num_tokens": 573080644.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 1.6167828344950115,
|
|
"grad_norm": 0.13048314689557292,
|
|
"learning_rate": 9.880935606594336e-07,
|
|
"loss": 1.2159,
|
|
"mean_token_accuracy": 0.7084046185016633,
|
|
"num_tokens": 574058248.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 1.1734375,
|
|
"epoch": 1.6181495148284815,
|
|
"grad_norm": 0.10898693323771001,
|
|
"learning_rate": 9.84570945469917e-07,
|
|
"loss": 1.1787,
|
|
"mean_token_accuracy": 0.7169816553592682,
|
|
"num_tokens": 574972001.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 1.20625,
|
|
"epoch": 1.6195161951619517,
|
|
"grad_norm": 0.13163049731432266,
|
|
"learning_rate": 9.810483302804003e-07,
|
|
"loss": 1.2011,
|
|
"mean_token_accuracy": 0.7103736281394959,
|
|
"num_tokens": 575886825.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.6208828754954216,
|
|
"grad_norm": 0.11836003341177831,
|
|
"learning_rate": 9.775257150908836e-07,
|
|
"loss": 1.176,
|
|
"mean_token_accuracy": 0.7155143082141876,
|
|
"num_tokens": 576782871.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.6222495558288916,
|
|
"grad_norm": 0.13395160947015552,
|
|
"learning_rate": 9.740030999013668e-07,
|
|
"loss": 1.185,
|
|
"mean_token_accuracy": 0.7139827132225036,
|
|
"num_tokens": 577679960.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.6236162361623616,
|
|
"grad_norm": 0.11313529154599998,
|
|
"learning_rate": 9.7048048471185e-07,
|
|
"loss": 1.1417,
|
|
"mean_token_accuracy": 0.7195683360099793,
|
|
"num_tokens": 578575401.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.6249829164958316,
|
|
"grad_norm": 0.10830784833390551,
|
|
"learning_rate": 9.669578695223334e-07,
|
|
"loss": 1.2053,
|
|
"mean_token_accuracy": 0.7111427426338196,
|
|
"num_tokens": 579514730.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 1.6263495968293016,
|
|
"grad_norm": 0.18091908299344037,
|
|
"learning_rate": 9.634352543328168e-07,
|
|
"loss": 1.176,
|
|
"mean_token_accuracy": 0.7151439547538757,
|
|
"num_tokens": 580408496.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.6277162771627718,
|
|
"grad_norm": 0.11862713970228235,
|
|
"learning_rate": 9.599126391433e-07,
|
|
"loss": 1.1815,
|
|
"mean_token_accuracy": 0.7135456800460815,
|
|
"num_tokens": 581329631.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 1.6290829574962418,
|
|
"grad_norm": 0.11668917836437687,
|
|
"learning_rate": 9.563900239537834e-07,
|
|
"loss": 1.2104,
|
|
"mean_token_accuracy": 0.7086131811141968,
|
|
"num_tokens": 582238004.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.6304496378297118,
|
|
"grad_norm": 0.12252118843294864,
|
|
"learning_rate": 9.528674087642667e-07,
|
|
"loss": 1.1517,
|
|
"mean_token_accuracy": 0.720429652929306,
|
|
"num_tokens": 583184118.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 1.6318163181631817,
|
|
"grad_norm": 0.30559934829065144,
|
|
"learning_rate": 9.4934479357475e-07,
|
|
"loss": 1.1451,
|
|
"mean_token_accuracy": 0.7230360031127929,
|
|
"num_tokens": 584112789.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 1.2109375,
|
|
"epoch": 1.6331829984966517,
|
|
"grad_norm": 0.12871711557517418,
|
|
"learning_rate": 9.458221783852332e-07,
|
|
"loss": 1.2067,
|
|
"mean_token_accuracy": 0.7086722910404205,
|
|
"num_tokens": 585074954.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.6345496788301217,
|
|
"grad_norm": 0.1539507248719715,
|
|
"learning_rate": 9.422995631957166e-07,
|
|
"loss": 1.1591,
|
|
"mean_token_accuracy": 0.7194820523262024,
|
|
"num_tokens": 585957971.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 1.2140625,
|
|
"epoch": 1.6359163591635917,
|
|
"grad_norm": 0.11993628221451304,
|
|
"learning_rate": 9.387769480061999e-07,
|
|
"loss": 1.2152,
|
|
"mean_token_accuracy": 0.7095519065856933,
|
|
"num_tokens": 586873047.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 1.17578125,
|
|
"epoch": 1.6372830394970617,
|
|
"grad_norm": 0.11809248619078255,
|
|
"learning_rate": 9.352543328166831e-07,
|
|
"loss": 1.1639,
|
|
"mean_token_accuracy": 0.717840576171875,
|
|
"num_tokens": 587833433.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.6386497198305316,
|
|
"grad_norm": 0.16756964690074744,
|
|
"learning_rate": 9.317317176271665e-07,
|
|
"loss": 1.1912,
|
|
"mean_token_accuracy": 0.7138761222362519,
|
|
"num_tokens": 588711143.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 1.21171875,
|
|
"epoch": 1.6400164001640016,
|
|
"grad_norm": 0.14233888345571724,
|
|
"learning_rate": 9.282091024376498e-07,
|
|
"loss": 1.2233,
|
|
"mean_token_accuracy": 0.7048332691192627,
|
|
"num_tokens": 589696087.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 1.2328125,
|
|
"epoch": 1.6413830804974716,
|
|
"grad_norm": 0.13576776487022665,
|
|
"learning_rate": 9.246864872481331e-07,
|
|
"loss": 1.2382,
|
|
"mean_token_accuracy": 0.7016281366348267,
|
|
"num_tokens": 590586600.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 1.17265625,
|
|
"epoch": 1.6427497608309416,
|
|
"grad_norm": 0.12309963153757253,
|
|
"learning_rate": 9.211638720586164e-07,
|
|
"loss": 1.1733,
|
|
"mean_token_accuracy": 0.715294623374939,
|
|
"num_tokens": 591468037.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.6441164411644116,
|
|
"grad_norm": 0.12338502352443095,
|
|
"learning_rate": 9.176412568690997e-07,
|
|
"loss": 1.1419,
|
|
"mean_token_accuracy": 0.722267496585846,
|
|
"num_tokens": 592386274.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 1.197265625,
|
|
"epoch": 1.6454831214978816,
|
|
"grad_norm": 0.12103051037389216,
|
|
"learning_rate": 9.14118641679583e-07,
|
|
"loss": 1.1997,
|
|
"mean_token_accuracy": 0.7104038298130035,
|
|
"num_tokens": 593328423.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.6468498018313515,
|
|
"grad_norm": 0.13807464270416792,
|
|
"learning_rate": 9.105960264900663e-07,
|
|
"loss": 1.1501,
|
|
"mean_token_accuracy": 0.7194877624511719,
|
|
"num_tokens": 594203300.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.6482164821648215,
|
|
"grad_norm": 0.12078496914106891,
|
|
"learning_rate": 9.070734113005496e-07,
|
|
"loss": 1.1829,
|
|
"mean_token_accuracy": 0.7149824261665344,
|
|
"num_tokens": 595167305.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 1.2359375,
|
|
"epoch": 1.6495831624982915,
|
|
"grad_norm": 0.1145951397691158,
|
|
"learning_rate": 9.035507961110329e-07,
|
|
"loss": 1.2385,
|
|
"mean_token_accuracy": 0.7051996469497681,
|
|
"num_tokens": 596075156.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.6509498428317615,
|
|
"grad_norm": 0.13166029403291538,
|
|
"learning_rate": 9.000281809215163e-07,
|
|
"loss": 1.1729,
|
|
"mean_token_accuracy": 0.7146727323532105,
|
|
"num_tokens": 596987265.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 1.6523165231652317,
|
|
"grad_norm": 0.11794681243965713,
|
|
"learning_rate": 8.965055657319995e-07,
|
|
"loss": 1.1392,
|
|
"mean_token_accuracy": 0.7244709730148315,
|
|
"num_tokens": 597900554.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.6536832034987017,
|
|
"grad_norm": 0.13414826739239818,
|
|
"learning_rate": 8.929829505424828e-07,
|
|
"loss": 1.1648,
|
|
"mean_token_accuracy": 0.7168895840644837,
|
|
"num_tokens": 598822687.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.6550498838321717,
|
|
"grad_norm": 0.13703944999319423,
|
|
"learning_rate": 8.894603353529662e-07,
|
|
"loss": 1.1458,
|
|
"mean_token_accuracy": 0.7213573098182678,
|
|
"num_tokens": 599736206.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.6564165641656416,
|
|
"grad_norm": 0.10972058375978551,
|
|
"learning_rate": 8.859377201634495e-07,
|
|
"loss": 1.1683,
|
|
"mean_token_accuracy": 0.7176376223564148,
|
|
"num_tokens": 600697415.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 1.6577832444991116,
|
|
"grad_norm": 0.11394034852450605,
|
|
"learning_rate": 8.824151049739326e-07,
|
|
"loss": 1.2149,
|
|
"mean_token_accuracy": 0.7066365659236908,
|
|
"num_tokens": 601632923.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 1.1421875,
|
|
"epoch": 1.6591499248325816,
|
|
"grad_norm": 0.12485819444210548,
|
|
"learning_rate": 8.788924897844161e-07,
|
|
"loss": 1.1524,
|
|
"mean_token_accuracy": 0.7203721940517426,
|
|
"num_tokens": 602585436.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 1.1375,
|
|
"epoch": 1.6605166051660518,
|
|
"grad_norm": 0.12171012169101336,
|
|
"learning_rate": 8.753698745948994e-07,
|
|
"loss": 1.1358,
|
|
"mean_token_accuracy": 0.7234384834766387,
|
|
"num_tokens": 603533298.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 1.14296875,
|
|
"epoch": 1.6618832854995218,
|
|
"grad_norm": 0.1268024714886373,
|
|
"learning_rate": 8.718472594053825e-07,
|
|
"loss": 1.1318,
|
|
"mean_token_accuracy": 0.7242269515991211,
|
|
"num_tokens": 604406463.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.6632499658329918,
|
|
"grad_norm": 0.16071071559267572,
|
|
"learning_rate": 8.683246442158659e-07,
|
|
"loss": 1.1701,
|
|
"mean_token_accuracy": 0.7176785945892334,
|
|
"num_tokens": 605302034.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 1.1390625,
|
|
"epoch": 1.6646166461664618,
|
|
"grad_norm": 0.12048296011077814,
|
|
"learning_rate": 8.648020290263492e-07,
|
|
"loss": 1.1435,
|
|
"mean_token_accuracy": 0.7220817148685456,
|
|
"num_tokens": 606180470.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.6659833264999317,
|
|
"grad_norm": 0.16662582113432498,
|
|
"learning_rate": 8.612794138368325e-07,
|
|
"loss": 1.185,
|
|
"mean_token_accuracy": 0.7157534420490265,
|
|
"num_tokens": 607113987.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 1.6673500068334017,
|
|
"grad_norm": 0.13557808539691402,
|
|
"learning_rate": 8.577567986473158e-07,
|
|
"loss": 1.1988,
|
|
"mean_token_accuracy": 0.712661623954773,
|
|
"num_tokens": 608002763.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.6687166871668717,
|
|
"grad_norm": 0.12279880799768375,
|
|
"learning_rate": 8.542341834577991e-07,
|
|
"loss": 1.1605,
|
|
"mean_token_accuracy": 0.7185430526733398,
|
|
"num_tokens": 608904426.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.6700833675003417,
|
|
"grad_norm": 0.12301740490565641,
|
|
"learning_rate": 8.507115682682824e-07,
|
|
"loss": 1.1411,
|
|
"mean_token_accuracy": 0.7222116827964783,
|
|
"num_tokens": 609805211.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 1.6714500478338117,
|
|
"grad_norm": 0.1414140586131915,
|
|
"learning_rate": 8.471889530787658e-07,
|
|
"loss": 1.2356,
|
|
"mean_token_accuracy": 0.7060823857784271,
|
|
"num_tokens": 610760760.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.6728167281672817,
|
|
"grad_norm": 0.11984062688792792,
|
|
"learning_rate": 8.43666337889249e-07,
|
|
"loss": 1.1703,
|
|
"mean_token_accuracy": 0.7162910640239716,
|
|
"num_tokens": 611668588.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 1.175390625,
|
|
"epoch": 1.6741834085007516,
|
|
"grad_norm": 0.11984777290867094,
|
|
"learning_rate": 8.401437226997323e-07,
|
|
"loss": 1.183,
|
|
"mean_token_accuracy": 0.7170411288738251,
|
|
"num_tokens": 612608531.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 1.160546875,
|
|
"epoch": 1.6755500888342216,
|
|
"grad_norm": 0.11616375247609867,
|
|
"learning_rate": 8.366211075102157e-07,
|
|
"loss": 1.161,
|
|
"mean_token_accuracy": 0.7187194526195526,
|
|
"num_tokens": 613470447.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.6769167691676916,
|
|
"grad_norm": 0.15059444520307713,
|
|
"learning_rate": 8.330984923206989e-07,
|
|
"loss": 1.1559,
|
|
"mean_token_accuracy": 0.7207857370376587,
|
|
"num_tokens": 614415521.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 1.6782834495011616,
|
|
"grad_norm": 0.13535442873434134,
|
|
"learning_rate": 8.295758771311822e-07,
|
|
"loss": 1.2325,
|
|
"mean_token_accuracy": 0.7042860150337219,
|
|
"num_tokens": 615281666.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.6796501298346316,
|
|
"grad_norm": 0.11256698278717957,
|
|
"learning_rate": 8.260532619416656e-07,
|
|
"loss": 1.1951,
|
|
"mean_token_accuracy": 0.7122718334197998,
|
|
"num_tokens": 616170270.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 1.178515625,
|
|
"epoch": 1.6810168101681016,
|
|
"grad_norm": 0.11284913889709669,
|
|
"learning_rate": 8.225306467521489e-07,
|
|
"loss": 1.175,
|
|
"mean_token_accuracy": 0.7172844529151916,
|
|
"num_tokens": 617090523.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 1.6823834905015715,
|
|
"grad_norm": 0.13795378573418868,
|
|
"learning_rate": 8.190080315626321e-07,
|
|
"loss": 1.2085,
|
|
"mean_token_accuracy": 0.710912561416626,
|
|
"num_tokens": 617980681.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 1.6837501708350415,
|
|
"grad_norm": 0.11134777584189458,
|
|
"learning_rate": 8.154854163731155e-07,
|
|
"loss": 1.1852,
|
|
"mean_token_accuracy": 0.7151150286197663,
|
|
"num_tokens": 618928018.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.6851168511685117,
|
|
"grad_norm": 0.1347051641836959,
|
|
"learning_rate": 8.119628011835988e-07,
|
|
"loss": 1.1898,
|
|
"mean_token_accuracy": 0.7111489117145539,
|
|
"num_tokens": 619835872.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 1.165234375,
|
|
"epoch": 1.6864835315019817,
|
|
"grad_norm": 0.12386859166158753,
|
|
"learning_rate": 8.084401859940821e-07,
|
|
"loss": 1.1653,
|
|
"mean_token_accuracy": 0.7183009922504425,
|
|
"num_tokens": 620728761.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 1.2046875,
|
|
"epoch": 1.6878502118354517,
|
|
"grad_norm": 0.11692854457111296,
|
|
"learning_rate": 8.049175708045654e-07,
|
|
"loss": 1.2228,
|
|
"mean_token_accuracy": 0.7070809543132782,
|
|
"num_tokens": 621663457.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.6892168921689217,
|
|
"grad_norm": 0.11625852396498697,
|
|
"learning_rate": 8.013949556150487e-07,
|
|
"loss": 1.1364,
|
|
"mean_token_accuracy": 0.7216257333755494,
|
|
"num_tokens": 622534500.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 1.6905835725023917,
|
|
"grad_norm": 0.12408622808834331,
|
|
"learning_rate": 7.97872340425532e-07,
|
|
"loss": 1.148,
|
|
"mean_token_accuracy": 0.7202049493789673,
|
|
"num_tokens": 623463196.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.6919502528358616,
|
|
"grad_norm": 0.13639208275231338,
|
|
"learning_rate": 7.943497252360153e-07,
|
|
"loss": 1.1905,
|
|
"mean_token_accuracy": 0.7111760079860687,
|
|
"num_tokens": 624360842.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 1.19921875,
|
|
"epoch": 1.6933169331693319,
|
|
"grad_norm": 0.11948404623925347,
|
|
"learning_rate": 7.908271100464986e-07,
|
|
"loss": 1.2063,
|
|
"mean_token_accuracy": 0.7089051127433776,
|
|
"num_tokens": 625317942.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 1.6946836135028018,
|
|
"grad_norm": 0.14369897017922048,
|
|
"learning_rate": 7.873044948569819e-07,
|
|
"loss": 1.2057,
|
|
"mean_token_accuracy": 0.7121761739253998,
|
|
"num_tokens": 626241375.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.6960502938362718,
|
|
"grad_norm": 0.128170347244973,
|
|
"learning_rate": 7.837818796674653e-07,
|
|
"loss": 1.1721,
|
|
"mean_token_accuracy": 0.7184717535972596,
|
|
"num_tokens": 627167442.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 1.6974169741697418,
|
|
"grad_norm": 0.13831492134545278,
|
|
"learning_rate": 7.802592644779484e-07,
|
|
"loss": 1.1795,
|
|
"mean_token_accuracy": 0.7139018714427948,
|
|
"num_tokens": 628127855.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.6987836545032118,
|
|
"grad_norm": 0.1312393915984031,
|
|
"learning_rate": 7.767366492884317e-07,
|
|
"loss": 1.1984,
|
|
"mean_token_accuracy": 0.7111853718757629,
|
|
"num_tokens": 629033420.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.7001503348366818,
|
|
"grad_norm": 0.11705228476882654,
|
|
"learning_rate": 7.732140340989152e-07,
|
|
"loss": 1.1795,
|
|
"mean_token_accuracy": 0.7151196479797364,
|
|
"num_tokens": 629992592.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.7015170151701517,
|
|
"grad_norm": 0.13182598757062752,
|
|
"learning_rate": 7.696914189093985e-07,
|
|
"loss": 1.1918,
|
|
"mean_token_accuracy": 0.7133052468299865,
|
|
"num_tokens": 630909709.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 1.14921875,
|
|
"epoch": 1.7028836955036217,
|
|
"grad_norm": 0.11182811137966393,
|
|
"learning_rate": 7.661688037198816e-07,
|
|
"loss": 1.1475,
|
|
"mean_token_accuracy": 0.7198295414447784,
|
|
"num_tokens": 631845165.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 1.134375,
|
|
"epoch": 1.7042503758370917,
|
|
"grad_norm": 0.11712784472045053,
|
|
"learning_rate": 7.62646188530365e-07,
|
|
"loss": 1.14,
|
|
"mean_token_accuracy": 0.7221791565418243,
|
|
"num_tokens": 632753221.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.7056170561705617,
|
|
"grad_norm": 0.11352978016690028,
|
|
"learning_rate": 7.591235733408483e-07,
|
|
"loss": 1.1395,
|
|
"mean_token_accuracy": 0.7225068151950836,
|
|
"num_tokens": 633683875.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 1.13671875,
|
|
"epoch": 1.7069837365040317,
|
|
"grad_norm": 0.13248854431661297,
|
|
"learning_rate": 7.556009581513315e-07,
|
|
"loss": 1.1235,
|
|
"mean_token_accuracy": 0.7242291092872619,
|
|
"num_tokens": 634598153.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 1.11796875,
|
|
"epoch": 1.7083504168375017,
|
|
"grad_norm": 0.108368489274951,
|
|
"learning_rate": 7.520783429618149e-07,
|
|
"loss": 1.1331,
|
|
"mean_token_accuracy": 0.7250804245471955,
|
|
"num_tokens": 635548372.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.7097170971709716,
|
|
"grad_norm": 0.11213949996577716,
|
|
"learning_rate": 7.485557277722982e-07,
|
|
"loss": 1.1613,
|
|
"mean_token_accuracy": 0.719782167673111,
|
|
"num_tokens": 636476416.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.7110837775044416,
|
|
"grad_norm": 0.13282848382594073,
|
|
"learning_rate": 7.450331125827815e-07,
|
|
"loss": 1.1754,
|
|
"mean_token_accuracy": 0.7149587213993073,
|
|
"num_tokens": 637406095.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.7124504578379116,
|
|
"grad_norm": 0.127932808336916,
|
|
"learning_rate": 7.415104973932648e-07,
|
|
"loss": 1.159,
|
|
"mean_token_accuracy": 0.7183066070079803,
|
|
"num_tokens": 638341278.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.7138171381713816,
|
|
"grad_norm": 0.14111909692320102,
|
|
"learning_rate": 7.379878822037481e-07,
|
|
"loss": 1.1324,
|
|
"mean_token_accuracy": 0.7236042141914367,
|
|
"num_tokens": 639203270.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 1.21015625,
|
|
"epoch": 1.7151838185048516,
|
|
"grad_norm": 0.11792130464855312,
|
|
"learning_rate": 7.344652670142314e-07,
|
|
"loss": 1.2065,
|
|
"mean_token_accuracy": 0.7105764806270599,
|
|
"num_tokens": 640129089.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 1.7165504988383216,
|
|
"grad_norm": 0.14891352945972297,
|
|
"learning_rate": 7.309426518247147e-07,
|
|
"loss": 1.1983,
|
|
"mean_token_accuracy": 0.7123305857181549,
|
|
"num_tokens": 641077162.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.7179171791717918,
|
|
"grad_norm": 0.11400654796031481,
|
|
"learning_rate": 7.27420036635198e-07,
|
|
"loss": 1.1766,
|
|
"mean_token_accuracy": 0.7163540899753571,
|
|
"num_tokens": 641975828.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 1.7192838595052617,
|
|
"grad_norm": 0.1927226836104978,
|
|
"learning_rate": 7.238974214456813e-07,
|
|
"loss": 1.224,
|
|
"mean_token_accuracy": 0.7047651946544647,
|
|
"num_tokens": 642924654.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.7206505398387317,
|
|
"grad_norm": 0.14476400216790497,
|
|
"learning_rate": 7.203748062561647e-07,
|
|
"loss": 1.1787,
|
|
"mean_token_accuracy": 0.7172920048236847,
|
|
"num_tokens": 643911925.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.7220172201722017,
|
|
"grad_norm": 0.11233489096794923,
|
|
"learning_rate": 7.168521910666479e-07,
|
|
"loss": 1.1805,
|
|
"mean_token_accuracy": 0.715908682346344,
|
|
"num_tokens": 644826729.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.7233839005056717,
|
|
"grad_norm": 0.1180687653664523,
|
|
"learning_rate": 7.133295758771312e-07,
|
|
"loss": 1.1779,
|
|
"mean_token_accuracy": 0.7147654056549072,
|
|
"num_tokens": 645748013.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.7247505808391417,
|
|
"grad_norm": 0.13951137961582075,
|
|
"learning_rate": 7.098069606876146e-07,
|
|
"loss": 1.1532,
|
|
"mean_token_accuracy": 0.7204329192638397,
|
|
"num_tokens": 646658575.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.7261172611726119,
|
|
"grad_norm": 0.12156939288851959,
|
|
"learning_rate": 7.062843454980979e-07,
|
|
"loss": 1.1612,
|
|
"mean_token_accuracy": 0.7207218706607819,
|
|
"num_tokens": 647550180.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 1.13984375,
|
|
"epoch": 1.7274839415060819,
|
|
"grad_norm": 0.11518066135405713,
|
|
"learning_rate": 7.027617303085811e-07,
|
|
"loss": 1.1434,
|
|
"mean_token_accuracy": 0.7231364846229553,
|
|
"num_tokens": 648484164.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.7288506218395518,
|
|
"grad_norm": 0.13312837325541096,
|
|
"learning_rate": 6.992391151190645e-07,
|
|
"loss": 1.1878,
|
|
"mean_token_accuracy": 0.7149413168430329,
|
|
"num_tokens": 649411592.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 1.7302173021730218,
|
|
"grad_norm": 0.15023680259629732,
|
|
"learning_rate": 6.957164999295478e-07,
|
|
"loss": 1.2246,
|
|
"mean_token_accuracy": 0.70513516664505,
|
|
"num_tokens": 650289609.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.7315839825064918,
|
|
"grad_norm": 0.11734213255765893,
|
|
"learning_rate": 6.921938847400309e-07,
|
|
"loss": 1.1618,
|
|
"mean_token_accuracy": 0.7163570106029511,
|
|
"num_tokens": 651168542.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.7329506628399618,
|
|
"grad_norm": 0.12283549885449853,
|
|
"learning_rate": 6.886712695505144e-07,
|
|
"loss": 1.1789,
|
|
"mean_token_accuracy": 0.7169443190097808,
|
|
"num_tokens": 652112587.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.7343173431734318,
|
|
"grad_norm": 0.12134655423039796,
|
|
"learning_rate": 6.851486543609977e-07,
|
|
"loss": 1.2372,
|
|
"mean_token_accuracy": 0.7034136295318604,
|
|
"num_tokens": 653046235.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.7356840235069018,
|
|
"grad_norm": 0.12580737319138782,
|
|
"learning_rate": 6.816260391714811e-07,
|
|
"loss": 1.1934,
|
|
"mean_token_accuracy": 0.7120255947113037,
|
|
"num_tokens": 653955381.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.7370507038403717,
|
|
"grad_norm": 0.12903470378929124,
|
|
"learning_rate": 6.781034239819642e-07,
|
|
"loss": 1.1864,
|
|
"mean_token_accuracy": 0.7126413106918335,
|
|
"num_tokens": 654878701.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 1.164453125,
|
|
"epoch": 1.7384173841738417,
|
|
"grad_norm": 0.12355723244502534,
|
|
"learning_rate": 6.745808087924475e-07,
|
|
"loss": 1.1601,
|
|
"mean_token_accuracy": 0.7190010011196136,
|
|
"num_tokens": 655785501.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.7397840645073117,
|
|
"grad_norm": 0.1242845519172022,
|
|
"learning_rate": 6.710581936029308e-07,
|
|
"loss": 1.2004,
|
|
"mean_token_accuracy": 0.7113914966583252,
|
|
"num_tokens": 656688562.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 1.15390625,
|
|
"epoch": 1.7411507448407817,
|
|
"grad_norm": 0.12302206362965608,
|
|
"learning_rate": 6.675355784134143e-07,
|
|
"loss": 1.1581,
|
|
"mean_token_accuracy": 0.7192179322242737,
|
|
"num_tokens": 657644690.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.7425174251742517,
|
|
"grad_norm": 0.12126770914678284,
|
|
"learning_rate": 6.640129632238974e-07,
|
|
"loss": 1.1581,
|
|
"mean_token_accuracy": 0.7228184819221497,
|
|
"num_tokens": 658562782.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.7438841055077217,
|
|
"grad_norm": 0.11372166221574848,
|
|
"learning_rate": 6.604903480343807e-07,
|
|
"loss": 1.2066,
|
|
"mean_token_accuracy": 0.7083667635917663,
|
|
"num_tokens": 659505786.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.7452507858411916,
|
|
"grad_norm": 0.1569811836115092,
|
|
"learning_rate": 6.569677328448641e-07,
|
|
"loss": 1.1502,
|
|
"mean_token_accuracy": 0.7235752820968628,
|
|
"num_tokens": 660399948.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.7466174661746616,
|
|
"grad_norm": 0.11603128930920886,
|
|
"learning_rate": 6.534451176553473e-07,
|
|
"loss": 1.1216,
|
|
"mean_token_accuracy": 0.7284882545471192,
|
|
"num_tokens": 661316636.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 1.1625,
|
|
"epoch": 1.7479841465081316,
|
|
"grad_norm": 0.11626842192509819,
|
|
"learning_rate": 6.499225024658306e-07,
|
|
"loss": 1.1733,
|
|
"mean_token_accuracy": 0.716050523519516,
|
|
"num_tokens": 662261997.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 1.13125,
|
|
"epoch": 1.7493508268416016,
|
|
"grad_norm": 0.115918855396845,
|
|
"learning_rate": 6.46399887276314e-07,
|
|
"loss": 1.1371,
|
|
"mean_token_accuracy": 0.7227889716625213,
|
|
"num_tokens": 663138325.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 1.7507175071750718,
|
|
"grad_norm": 0.145651229568168,
|
|
"learning_rate": 6.428772720867973e-07,
|
|
"loss": 1.223,
|
|
"mean_token_accuracy": 0.7060853779315949,
|
|
"num_tokens": 664041710.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.7520841875085418,
|
|
"grad_norm": 0.12400310769618726,
|
|
"learning_rate": 6.393546568972805e-07,
|
|
"loss": 1.1799,
|
|
"mean_token_accuracy": 0.715434056520462,
|
|
"num_tokens": 664932815.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 1.18203125,
|
|
"epoch": 1.7534508678420118,
|
|
"grad_norm": 0.12653812441528087,
|
|
"learning_rate": 6.358320417077639e-07,
|
|
"loss": 1.1816,
|
|
"mean_token_accuracy": 0.7154142916202545,
|
|
"num_tokens": 665841431.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 1.15859375,
|
|
"epoch": 1.7548175481754817,
|
|
"grad_norm": 0.14416782234371986,
|
|
"learning_rate": 6.323094265182472e-07,
|
|
"loss": 1.168,
|
|
"mean_token_accuracy": 0.7151417255401611,
|
|
"num_tokens": 666717852.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.7561842285089517,
|
|
"grad_norm": 0.12144813903988871,
|
|
"learning_rate": 6.287868113287306e-07,
|
|
"loss": 1.1247,
|
|
"mean_token_accuracy": 0.7256034016609192,
|
|
"num_tokens": 667672141.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.7575509088424217,
|
|
"grad_norm": 0.12908362226698658,
|
|
"learning_rate": 6.252641961392138e-07,
|
|
"loss": 1.1942,
|
|
"mean_token_accuracy": 0.7117061078548431,
|
|
"num_tokens": 668610796.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 1.125390625,
|
|
"epoch": 1.758917589175892,
|
|
"grad_norm": 0.12227264262938403,
|
|
"learning_rate": 6.217415809496971e-07,
|
|
"loss": 1.1402,
|
|
"mean_token_accuracy": 0.7217134058475494,
|
|
"num_tokens": 669533180.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.760284269509362,
|
|
"grad_norm": 0.11298900512994325,
|
|
"learning_rate": 6.182189657601804e-07,
|
|
"loss": 1.2024,
|
|
"mean_token_accuracy": 0.7124640107154846,
|
|
"num_tokens": 670485826.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.7616509498428319,
|
|
"grad_norm": 0.12353757100413154,
|
|
"learning_rate": 6.146963505706637e-07,
|
|
"loss": 1.1746,
|
|
"mean_token_accuracy": 0.7160246670246124,
|
|
"num_tokens": 671399671.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 1.7630176301763019,
|
|
"grad_norm": 0.11540115305153534,
|
|
"learning_rate": 6.11173735381147e-07,
|
|
"loss": 1.1788,
|
|
"mean_token_accuracy": 0.7171907424926758,
|
|
"num_tokens": 672290522.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.7643843105097718,
|
|
"grad_norm": 0.10804037279849142,
|
|
"learning_rate": 6.076511201916304e-07,
|
|
"loss": 1.2024,
|
|
"mean_token_accuracy": 0.712280660867691,
|
|
"num_tokens": 673198438.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.7657509908432418,
|
|
"grad_norm": 0.13811403725063284,
|
|
"learning_rate": 6.041285050021136e-07,
|
|
"loss": 1.169,
|
|
"mean_token_accuracy": 0.7169443786144256,
|
|
"num_tokens": 674059965.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.7671176711767118,
|
|
"grad_norm": 0.14106112030891094,
|
|
"learning_rate": 6.006058898125969e-07,
|
|
"loss": 1.1797,
|
|
"mean_token_accuracy": 0.7157745242118836,
|
|
"num_tokens": 675027103.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 1.2296875,
|
|
"epoch": 1.7684843515101818,
|
|
"grad_norm": 0.11715396703705996,
|
|
"learning_rate": 5.970832746230803e-07,
|
|
"loss": 1.2416,
|
|
"mean_token_accuracy": 0.7043656051158905,
|
|
"num_tokens": 676021720.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.7698510318436518,
|
|
"grad_norm": 0.12021676102637138,
|
|
"learning_rate": 5.935606594335636e-07,
|
|
"loss": 1.1874,
|
|
"mean_token_accuracy": 0.7140327632427216,
|
|
"num_tokens": 676989574.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.7712177121771218,
|
|
"grad_norm": 0.2057116153527679,
|
|
"learning_rate": 5.900380442440468e-07,
|
|
"loss": 1.1903,
|
|
"mean_token_accuracy": 0.7125104665756226,
|
|
"num_tokens": 677905078.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 1.7725843925105917,
|
|
"grad_norm": 0.12321407853690107,
|
|
"learning_rate": 5.8651542905453e-07,
|
|
"loss": 1.2304,
|
|
"mean_token_accuracy": 0.7064425885677338,
|
|
"num_tokens": 678842968.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.7739510728440617,
|
|
"grad_norm": 0.12201494289585278,
|
|
"learning_rate": 5.829928138650135e-07,
|
|
"loss": 1.1584,
|
|
"mean_token_accuracy": 0.720017260313034,
|
|
"num_tokens": 679751995.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.7753177531775317,
|
|
"grad_norm": 0.11725068618845941,
|
|
"learning_rate": 5.794701986754967e-07,
|
|
"loss": 1.1872,
|
|
"mean_token_accuracy": 0.7129684746265411,
|
|
"num_tokens": 680662996.0,
|
|
"step": 12990
|
|
},
|
|
{
"entropy": 1.1859375,
"epoch": 1.7766844335110017,
"grad_norm": 0.1281245273197947,
"learning_rate": 5.759475834859801e-07,
"loss": 1.1924,
"mean_token_accuracy": 0.7146298348903656,
"num_tokens": 681586128.0,
"step": 13000
},
|
|
{
|
|
"entropy": 1.2125,
|
|
"epoch": 1.7780511138444717,
|
|
"grad_norm": 0.1361755771056598,
|
|
"learning_rate": 5.724249682964633e-07,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.7067513644695282,
|
|
"num_tokens": 682478027.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.7794177941779417,
|
|
"grad_norm": 0.12316742238243315,
|
|
"learning_rate": 5.689023531069466e-07,
|
|
"loss": 1.1795,
|
|
"mean_token_accuracy": 0.7145565152168274,
|
|
"num_tokens": 683441578.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.7807844745114116,
|
|
"grad_norm": 0.18502931050800078,
|
|
"learning_rate": 5.6537973791743e-07,
|
|
"loss": 1.1911,
|
|
"mean_token_accuracy": 0.714904111623764,
|
|
"num_tokens": 684345183.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 1.15625,
|
|
"epoch": 1.7821511548448816,
|
|
"grad_norm": 0.12577917865851237,
|
|
"learning_rate": 5.618571227279133e-07,
|
|
"loss": 1.1687,
|
|
"mean_token_accuracy": 0.7199515521526336,
|
|
"num_tokens": 685294484.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.7835178351783518,
|
|
"grad_norm": 0.11933000297664688,
|
|
"learning_rate": 5.583345075383965e-07,
|
|
"loss": 1.1467,
|
|
"mean_token_accuracy": 0.7212702929973602,
|
|
"num_tokens": 686245948.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 1.137890625,
|
|
"epoch": 1.7848845155118218,
|
|
"grad_norm": 0.12506117206471423,
|
|
"learning_rate": 5.548118923488799e-07,
|
|
"loss": 1.1363,
|
|
"mean_token_accuracy": 0.7228946805000305,
|
|
"num_tokens": 687124126.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.7862511958452918,
|
|
"grad_norm": 0.11141168203062383,
|
|
"learning_rate": 5.512892771593632e-07,
|
|
"loss": 1.1505,
|
|
"mean_token_accuracy": 0.7199471235275269,
|
|
"num_tokens": 688056046.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 1.203125,
|
|
"epoch": 1.7876178761787618,
|
|
"grad_norm": 0.11464631001748835,
|
|
"learning_rate": 5.477666619698464e-07,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.7095086395740509,
|
|
"num_tokens": 688992902.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 1.1453125,
|
|
"epoch": 1.7889845565122318,
|
|
"grad_norm": 0.12617354278837797,
|
|
"learning_rate": 5.442440467803298e-07,
|
|
"loss": 1.152,
|
|
"mean_token_accuracy": 0.719246792793274,
|
|
"num_tokens": 689906664.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.7903512368457017,
|
|
"grad_norm": 0.13347467043380887,
|
|
"learning_rate": 5.407214315908131e-07,
|
|
"loss": 1.1883,
|
|
"mean_token_accuracy": 0.7141229093074799,
|
|
"num_tokens": 690848275.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.791717917179172,
|
|
"grad_norm": 0.11795690161787355,
|
|
"learning_rate": 5.371988164012964e-07,
|
|
"loss": 1.1997,
|
|
"mean_token_accuracy": 0.7118562638759613,
|
|
"num_tokens": 691813408.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 1.1328125,
|
|
"epoch": 1.793084597512642,
|
|
"grad_norm": 0.11303012420789099,
|
|
"learning_rate": 5.336762012117797e-07,
|
|
"loss": 1.1388,
|
|
"mean_token_accuracy": 0.7232256352901458,
|
|
"num_tokens": 692750415.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 1.178125,
|
|
"epoch": 1.794451277846112,
|
|
"grad_norm": 0.14978283530267156,
|
|
"learning_rate": 5.30153586022263e-07,
|
|
"loss": 1.1836,
|
|
"mean_token_accuracy": 0.7123375773429871,
|
|
"num_tokens": 693691681.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.795817958179582,
|
|
"grad_norm": 0.12752215129552424,
|
|
"learning_rate": 5.266309708327462e-07,
|
|
"loss": 1.1603,
|
|
"mean_token_accuracy": 0.7188456118106842,
|
|
"num_tokens": 694604144.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 1.13828125,
|
|
"epoch": 1.7971846385130519,
|
|
"grad_norm": 0.12111838194353569,
|
|
"learning_rate": 5.231083556432296e-07,
|
|
"loss": 1.1513,
|
|
"mean_token_accuracy": 0.722715538740158,
|
|
"num_tokens": 695503466.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 1.20859375,
|
|
"epoch": 1.7985513188465219,
|
|
"grad_norm": 0.11233488686317898,
|
|
"learning_rate": 5.195857404537129e-07,
|
|
"loss": 1.2332,
|
|
"mean_token_accuracy": 0.7098071098327636,
|
|
"num_tokens": 696413161.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.7999179991799918,
|
|
"grad_norm": 0.10978518432737694,
|
|
"learning_rate": 5.160631252641961e-07,
|
|
"loss": 1.1692,
|
|
"mean_token_accuracy": 0.7198370039463043,
|
|
"num_tokens": 697305779.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 1.1671875,
|
|
"epoch": 1.8012846795134618,
|
|
"grad_norm": 0.14287526227483183,
|
|
"learning_rate": 5.125405100746795e-07,
|
|
"loss": 1.1758,
|
|
"mean_token_accuracy": 0.7174386262893677,
|
|
"num_tokens": 698254730.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.8026513598469318,
|
|
"grad_norm": 0.1159687990843662,
|
|
"learning_rate": 5.090178948851628e-07,
|
|
"loss": 1.1835,
|
|
"mean_token_accuracy": 0.7128191709518432,
|
|
"num_tokens": 699207989.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 1.8040180401804018,
|
|
"grad_norm": 0.12780407338162644,
|
|
"learning_rate": 5.054952796956461e-07,
|
|
"loss": 1.2131,
|
|
"mean_token_accuracy": 0.7068031787872314,
|
|
"num_tokens": 700108356.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 1.1359375,
|
|
"epoch": 1.8053847205138718,
|
|
"grad_norm": 0.12122294647151574,
|
|
"learning_rate": 5.019726645061294e-07,
|
|
"loss": 1.1367,
|
|
"mean_token_accuracy": 0.7238821089267731,
|
|
"num_tokens": 701047533.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 1.23046875,
|
|
"epoch": 1.8067514008473418,
|
|
"grad_norm": 0.12216062722208466,
|
|
"learning_rate": 4.984500493166127e-07,
|
|
"loss": 1.2388,
|
|
"mean_token_accuracy": 0.7083369135856629,
|
|
"num_tokens": 701981994.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 1.124609375,
|
|
"epoch": 1.8081180811808117,
|
|
"grad_norm": 0.12465892758455947,
|
|
"learning_rate": 4.94927434127096e-07,
|
|
"loss": 1.1264,
|
|
"mean_token_accuracy": 0.725023764371872,
|
|
"num_tokens": 702882139.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 1.14453125,
|
|
"epoch": 1.8094847615142817,
|
|
"grad_norm": 0.1253170164306996,
|
|
"learning_rate": 4.914048189375794e-07,
|
|
"loss": 1.1461,
|
|
"mean_token_accuracy": 0.7211774468421936,
|
|
"num_tokens": 703785858.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.8108514418477517,
|
|
"grad_norm": 0.13986305503330157,
|
|
"learning_rate": 4.878822037480626e-07,
|
|
"loss": 1.1656,
|
|
"mean_token_accuracy": 0.7170386731624603,
|
|
"num_tokens": 704624547.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.8122181221812217,
|
|
"grad_norm": 0.11195030718005543,
|
|
"learning_rate": 4.843595885585459e-07,
|
|
"loss": 1.1663,
|
|
"mean_token_accuracy": 0.7174375712871551,
|
|
"num_tokens": 705518692.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.8135848025146917,
|
|
"grad_norm": 0.1295668861920939,
|
|
"learning_rate": 4.808369733690292e-07,
|
|
"loss": 1.2041,
|
|
"mean_token_accuracy": 0.7086914241313934,
|
|
"num_tokens": 706467325.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 1.8149514828481617,
|
|
"grad_norm": 0.12040101235199166,
|
|
"learning_rate": 4.773143581795125e-07,
|
|
"loss": 1.2432,
|
|
"mean_token_accuracy": 0.7049016714096069,
|
|
"num_tokens": 707440133.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.8163181631816319,
|
|
"grad_norm": 0.13027001151960527,
|
|
"learning_rate": 4.7379174298999583e-07,
|
|
"loss": 1.1235,
|
|
"mean_token_accuracy": 0.7264056861400604,
|
|
"num_tokens": 708341371.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 1.1828125,
|
|
"epoch": 1.8176848435151018,
|
|
"grad_norm": 0.11272336694180865,
|
|
"learning_rate": 4.702691278004791e-07,
|
|
"loss": 1.1929,
|
|
"mean_token_accuracy": 0.714560478925705,
|
|
"num_tokens": 709301345.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 1.171875,
|
|
"epoch": 1.8190515238485718,
|
|
"grad_norm": 0.13935484981617186,
|
|
"learning_rate": 4.6674651261096245e-07,
|
|
"loss": 1.1788,
|
|
"mean_token_accuracy": 0.7144341945648194,
|
|
"num_tokens": 710227083.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 1.1265625,
|
|
"epoch": 1.8204182041820418,
|
|
"grad_norm": 0.1307473307434354,
|
|
"learning_rate": 4.632238974214457e-07,
|
|
"loss": 1.1279,
|
|
"mean_token_accuracy": 0.7249894678592682,
|
|
"num_tokens": 711152913.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 1.152734375,
|
|
"epoch": 1.8217848845155118,
|
|
"grad_norm": 0.11855667821929392,
|
|
"learning_rate": 4.5970128223192907e-07,
|
|
"loss": 1.1486,
|
|
"mean_token_accuracy": 0.7220784664154053,
|
|
"num_tokens": 712073167.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 1.12734375,
|
|
"epoch": 1.8231515648489818,
|
|
"grad_norm": 0.1198888840296408,
|
|
"learning_rate": 4.561786670424123e-07,
|
|
"loss": 1.1317,
|
|
"mean_token_accuracy": 0.7249309837818145,
|
|
"num_tokens": 712972864.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 1.183984375,
|
|
"epoch": 1.824518245182452,
|
|
"grad_norm": 0.11750968083478032,
|
|
"learning_rate": 4.5265605185289563e-07,
|
|
"loss": 1.2047,
|
|
"mean_token_accuracy": 0.71272953748703,
|
|
"num_tokens": 713917176.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.825884925515922,
|
|
"grad_norm": 0.11862800290784825,
|
|
"learning_rate": 4.491334366633789e-07,
|
|
"loss": 1.1404,
|
|
"mean_token_accuracy": 0.7220163941383362,
|
|
"num_tokens": 714855686.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.827251605849392,
|
|
"grad_norm": 0.11675478142033319,
|
|
"learning_rate": 4.456108214738622e-07,
|
|
"loss": 1.1677,
|
|
"mean_token_accuracy": 0.7177301406860351,
|
|
"num_tokens": 715777164.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 1.175,
|
|
"epoch": 1.828618286182862,
|
|
"grad_norm": 0.13333049641142938,
|
|
"learning_rate": 4.4208820628434553e-07,
|
|
"loss": 1.1778,
|
|
"mean_token_accuracy": 0.7169080555438996,
|
|
"num_tokens": 716699260.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.829984966516332,
|
|
"grad_norm": 0.1282707570807869,
|
|
"learning_rate": 4.385655910948288e-07,
|
|
"loss": 1.1948,
|
|
"mean_token_accuracy": 0.7113196790218353,
|
|
"num_tokens": 717625948.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.831351646849802,
|
|
"grad_norm": 0.13962697295857252,
|
|
"learning_rate": 4.3504297590531215e-07,
|
|
"loss": 1.2087,
|
|
"mean_token_accuracy": 0.7093973994255066,
|
|
"num_tokens": 718555781.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.8327183271832719,
|
|
"grad_norm": 0.13881055693202263,
|
|
"learning_rate": 4.3152036071579543e-07,
|
|
"loss": 1.1775,
|
|
"mean_token_accuracy": 0.7176052033901215,
|
|
"num_tokens": 719501185.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.8340850075167419,
|
|
"grad_norm": 0.1256299332071278,
|
|
"learning_rate": 4.2799774552627877e-07,
|
|
"loss": 1.1459,
|
|
"mean_token_accuracy": 0.7214141964912415,
|
|
"num_tokens": 720446046.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 1.1546875,
|
|
"epoch": 1.8354516878502118,
|
|
"grad_norm": 0.13032495069152833,
|
|
"learning_rate": 4.2447513033676205e-07,
|
|
"loss": 1.163,
|
|
"mean_token_accuracy": 0.7192430078983307,
|
|
"num_tokens": 721358206.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 1.15546875,
|
|
"epoch": 1.8368183681836818,
|
|
"grad_norm": 0.11935386623445694,
|
|
"learning_rate": 4.209525151472454e-07,
|
|
"loss": 1.1487,
|
|
"mean_token_accuracy": 0.7211388945579529,
|
|
"num_tokens": 722257844.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.8381850485171518,
|
|
"grad_norm": 0.13694945028718514,
|
|
"learning_rate": 4.1742989995772867e-07,
|
|
"loss": 1.1888,
|
|
"mean_token_accuracy": 0.7111744463443757,
|
|
"num_tokens": 723211347.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 1.2078125,
|
|
"epoch": 1.8395517288506218,
|
|
"grad_norm": 0.12083030647881961,
|
|
"learning_rate": 4.13907284768212e-07,
|
|
"loss": 1.2239,
|
|
"mean_token_accuracy": 0.7082141757011413,
|
|
"num_tokens": 724104007.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.8409184091840918,
|
|
"grad_norm": 0.12777585567706495,
|
|
"learning_rate": 4.1038466957869523e-07,
|
|
"loss": 1.1866,
|
|
"mean_token_accuracy": 0.7148985087871551,
|
|
"num_tokens": 724995973.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 1.19453125,
|
|
"epoch": 1.8422850895175618,
|
|
"grad_norm": 0.11543791459112207,
|
|
"learning_rate": 4.068620543891785e-07,
|
|
"loss": 1.1918,
|
|
"mean_token_accuracy": 0.7158011615276336,
|
|
"num_tokens": 725987719.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 1.16484375,
|
|
"epoch": 1.8436517698510317,
|
|
"grad_norm": 0.1158818218805783,
|
|
"learning_rate": 4.0333943919966185e-07,
|
|
"loss": 1.1771,
|
|
"mean_token_accuracy": 0.7156673192977905,
|
|
"num_tokens": 726964775.0,
|
|
"step": 13490
|
|
},
|
|
{
"entropy": 1.12890625,
"epoch": 1.8450184501845017,
"grad_norm": 0.12029719080391667,
"learning_rate": 3.9981682401014513e-07,
"loss": 1.128,
"mean_token_accuracy": 0.7259192287921905,
"num_tokens": 727930546.0,
"step": 13500
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.8463851305179717,
|
|
"grad_norm": 0.12999641243278798,
|
|
"learning_rate": 3.9629420882062847e-07,
|
|
"loss": 1.1643,
|
|
"mean_token_accuracy": 0.7182439088821411,
|
|
"num_tokens": 728842047.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.8477518108514417,
|
|
"grad_norm": 0.12006332161095681,
|
|
"learning_rate": 3.9277159363111175e-07,
|
|
"loss": 1.1787,
|
|
"mean_token_accuracy": 0.7168653726577758,
|
|
"num_tokens": 729768384.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 1.184375,
|
|
"epoch": 1.849118491184912,
|
|
"grad_norm": 0.12254636339745945,
|
|
"learning_rate": 3.892489784415951e-07,
|
|
"loss": 1.1986,
|
|
"mean_token_accuracy": 0.7100696623325348,
|
|
"num_tokens": 730678128.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 1.8504851715183819,
|
|
"grad_norm": 0.11729964662266415,
|
|
"learning_rate": 3.8572636325207837e-07,
|
|
"loss": 1.2107,
|
|
"mean_token_accuracy": 0.7081273794174194,
|
|
"num_tokens": 731530210.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 1.23203125,
|
|
"epoch": 1.8518518518518519,
|
|
"grad_norm": 0.1310850270868724,
|
|
"learning_rate": 3.822037480625617e-07,
|
|
"loss": 1.2277,
|
|
"mean_token_accuracy": 0.7071851968765259,
|
|
"num_tokens": 732452716.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.8532185321853218,
|
|
"grad_norm": 0.11802915361708056,
|
|
"learning_rate": 3.78681132873045e-07,
|
|
"loss": 1.1261,
|
|
"mean_token_accuracy": 0.7257517814636231,
|
|
"num_tokens": 733438187.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.8545852125187918,
|
|
"grad_norm": 0.12674025525595842,
|
|
"learning_rate": 3.7515851768352827e-07,
|
|
"loss": 1.1899,
|
|
"mean_token_accuracy": 0.7129082024097443,
|
|
"num_tokens": 734353210.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.8559518928522618,
|
|
"grad_norm": 0.14050447500664975,
|
|
"learning_rate": 3.716359024940116e-07,
|
|
"loss": 1.2061,
|
|
"mean_token_accuracy": 0.7082083523273468,
|
|
"num_tokens": 735236466.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 1.857318573185732,
|
|
"grad_norm": 0.1225527314876881,
|
|
"learning_rate": 3.6811328730449484e-07,
|
|
"loss": 1.2022,
|
|
"mean_token_accuracy": 0.7092620313167572,
|
|
"num_tokens": 736179977.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.858685253519202,
|
|
"grad_norm": 0.12278867569232142,
|
|
"learning_rate": 3.645906721149782e-07,
|
|
"loss": 1.1677,
|
|
"mean_token_accuracy": 0.7167464554309845,
|
|
"num_tokens": 737048433.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.860051933852672,
|
|
"grad_norm": 0.12233813657146451,
|
|
"learning_rate": 3.6106805692546145e-07,
|
|
"loss": 1.1725,
|
|
"mean_token_accuracy": 0.7167407870292664,
|
|
"num_tokens": 738015859.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 1.861418614186142,
|
|
"grad_norm": 0.13411864866442208,
|
|
"learning_rate": 3.575454417359448e-07,
|
|
"loss": 1.2094,
|
|
"mean_token_accuracy": 0.7090843200683594,
|
|
"num_tokens": 738920866.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 1.17421875,
|
|
"epoch": 1.862785294519612,
|
|
"grad_norm": 0.1798688852885377,
|
|
"learning_rate": 3.5402282654642807e-07,
|
|
"loss": 1.1661,
|
|
"mean_token_accuracy": 0.7184894680976868,
|
|
"num_tokens": 739814821.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.864151974853082,
|
|
"grad_norm": 0.12330712641087124,
|
|
"learning_rate": 3.505002113569114e-07,
|
|
"loss": 1.2057,
|
|
"mean_token_accuracy": 0.7090600669384003,
|
|
"num_tokens": 740758504.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.865518655186552,
|
|
"grad_norm": 0.19329834724040132,
|
|
"learning_rate": 3.469775961673947e-07,
|
|
"loss": 1.1911,
|
|
"mean_token_accuracy": 0.7120966494083405,
|
|
"num_tokens": 741643496.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 1.13203125,
|
|
"epoch": 1.866885335520022,
|
|
"grad_norm": 0.11892771578579936,
|
|
"learning_rate": 3.43454980977878e-07,
|
|
"loss": 1.1379,
|
|
"mean_token_accuracy": 0.7271609842777252,
|
|
"num_tokens": 742542144.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.8682520158534919,
|
|
"grad_norm": 0.13289696194068668,
|
|
"learning_rate": 3.399323657883613e-07,
|
|
"loss": 1.1912,
|
|
"mean_token_accuracy": 0.712562495470047,
|
|
"num_tokens": 743452088.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.8696186961869619,
|
|
"grad_norm": 0.1166691412021652,
|
|
"learning_rate": 3.364097505988446e-07,
|
|
"loss": 1.2084,
|
|
"mean_token_accuracy": 0.7104374885559082,
|
|
"num_tokens": 744413817.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.8709853765204318,
|
|
"grad_norm": 0.12341033875855971,
|
|
"learning_rate": 3.328871354093279e-07,
|
|
"loss": 1.2022,
|
|
"mean_token_accuracy": 0.7100586473941803,
|
|
"num_tokens": 745368873.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.8723520568539018,
|
|
"grad_norm": 0.13692812099132576,
|
|
"learning_rate": 3.293645202198112e-07,
|
|
"loss": 1.1907,
|
|
"mean_token_accuracy": 0.7135661602020263,
|
|
"num_tokens": 746297787.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.8737187371873718,
|
|
"grad_norm": 0.11452917075073579,
|
|
"learning_rate": 3.2584190503029454e-07,
|
|
"loss": 1.198,
|
|
"mean_token_accuracy": 0.710840517282486,
|
|
"num_tokens": 747185629.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.8750854175208418,
|
|
"grad_norm": 0.13022155054497178,
|
|
"learning_rate": 3.223192898407778e-07,
|
|
"loss": 1.1689,
|
|
"mean_token_accuracy": 0.718472707271576,
|
|
"num_tokens": 748116243.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.8764520978543118,
|
|
"grad_norm": 0.10980571359824398,
|
|
"learning_rate": 3.1879667465126116e-07,
|
|
"loss": 1.1847,
|
|
"mean_token_accuracy": 0.7168484628200531,
|
|
"num_tokens": 749050129.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 1.16953125,
|
|
"epoch": 1.8778187781877818,
|
|
"grad_norm": 0.13672227830904504,
|
|
"learning_rate": 3.152740594617444e-07,
|
|
"loss": 1.173,
|
|
"mean_token_accuracy": 0.7172909080982208,
|
|
"num_tokens": 750023119.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 1.2,
|
|
"epoch": 1.8791854585212517,
|
|
"grad_norm": 0.11627434354270369,
|
|
"learning_rate": 3.117514442722277e-07,
|
|
"loss": 1.197,
|
|
"mean_token_accuracy": 0.7140404760837555,
|
|
"num_tokens": 750970721.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 1.1078125,
|
|
"epoch": 1.8805521388547217,
|
|
"grad_norm": 0.1205706751051687,
|
|
"learning_rate": 3.08228829082711e-07,
|
|
"loss": 1.1172,
|
|
"mean_token_accuracy": 0.7265090405941009,
|
|
"num_tokens": 751935926.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 1.18828125,
|
|
"epoch": 1.881918819188192,
|
|
"grad_norm": 0.1262649518761016,
|
|
"learning_rate": 3.0470621389319434e-07,
|
|
"loss": 1.1875,
|
|
"mean_token_accuracy": 0.7142050087451934,
|
|
"num_tokens": 752834374.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.883285499521662,
|
|
"grad_norm": 0.11708836777128852,
|
|
"learning_rate": 3.0118359870367763e-07,
|
|
"loss": 1.1542,
|
|
"mean_token_accuracy": 0.7225340485572815,
|
|
"num_tokens": 753707513.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.884652179855132,
|
|
"grad_norm": 0.11492230956719325,
|
|
"learning_rate": 2.9766098351416096e-07,
|
|
"loss": 1.154,
|
|
"mean_token_accuracy": 0.7206594407558441,
|
|
"num_tokens": 754638258.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 1.16875,
|
|
"epoch": 1.8860188601886019,
|
|
"grad_norm": 0.13144130645953883,
|
|
"learning_rate": 2.9413836832464424e-07,
|
|
"loss": 1.1808,
|
|
"mean_token_accuracy": 0.7145294845104218,
|
|
"num_tokens": 755597165.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 1.21484375,
|
|
"epoch": 1.8873855405220719,
|
|
"grad_norm": 0.11223076173167154,
|
|
"learning_rate": 2.906157531351276e-07,
|
|
"loss": 1.2271,
|
|
"mean_token_accuracy": 0.7088210582733154,
|
|
"num_tokens": 756528198.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.8887522208555418,
|
|
"grad_norm": 0.1320990793402249,
|
|
"learning_rate": 2.8709313794561086e-07,
|
|
"loss": 1.1875,
|
|
"mean_token_accuracy": 0.715322858095169,
|
|
"num_tokens": 757471912.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 1.18671875,
|
|
"epoch": 1.890118901189012,
|
|
"grad_norm": 0.1192385731587495,
|
|
"learning_rate": 2.8357052275609415e-07,
|
|
"loss": 1.1826,
|
|
"mean_token_accuracy": 0.7137760043144226,
|
|
"num_tokens": 758350949.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 1.22734375,
|
|
"epoch": 1.891485581522482,
|
|
"grad_norm": 0.17219678864540836,
|
|
"learning_rate": 2.8004790756657743e-07,
|
|
"loss": 1.2317,
|
|
"mean_token_accuracy": 0.7080485045909881,
|
|
"num_tokens": 759272260.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.892852261855952,
|
|
"grad_norm": 0.1338464350763704,
|
|
"learning_rate": 2.7652529237706076e-07,
|
|
"loss": 1.1467,
|
|
"mean_token_accuracy": 0.7208349108695984,
|
|
"num_tokens": 760187752.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 1.12265625,
|
|
"epoch": 1.894218942189422,
|
|
"grad_norm": 0.11950677194853809,
|
|
"learning_rate": 2.7300267718754405e-07,
|
|
"loss": 1.1272,
|
|
"mean_token_accuracy": 0.7267131209373474,
|
|
"num_tokens": 761081492.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 1.101953125,
|
|
"epoch": 1.895585622522892,
|
|
"grad_norm": 0.12752907018187795,
|
|
"learning_rate": 2.694800619980274e-07,
|
|
"loss": 1.1107,
|
|
"mean_token_accuracy": 0.7285680830478668,
|
|
"num_tokens": 761999391.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.896952302856362,
|
|
"grad_norm": 0.13981913498698612,
|
|
"learning_rate": 2.6595744680851066e-07,
|
|
"loss": 1.1535,
|
|
"mean_token_accuracy": 0.7179949283599854,
|
|
"num_tokens": 762955304.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 1.190625,
|
|
"epoch": 1.898318983189832,
|
|
"grad_norm": 0.11655432128878566,
|
|
"learning_rate": 2.6243483161899395e-07,
|
|
"loss": 1.2008,
|
|
"mean_token_accuracy": 0.7118051707744598,
|
|
"num_tokens": 763856099.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 1.17890625,
|
|
"epoch": 1.899685663523302,
|
|
"grad_norm": 0.1996289410028969,
|
|
"learning_rate": 2.589122164294773e-07,
|
|
"loss": 1.1816,
|
|
"mean_token_accuracy": 0.714921623468399,
|
|
"num_tokens": 764830762.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 1.1921875,
|
|
"epoch": 1.901052343856772,
|
|
"grad_norm": 0.13151410024401256,
|
|
"learning_rate": 2.5538960123996056e-07,
|
|
"loss": 1.2114,
|
|
"mean_token_accuracy": 0.7105202913284302,
|
|
"num_tokens": 765744796.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 1.902419024190242,
|
|
"grad_norm": 0.12050160185793975,
|
|
"learning_rate": 2.518669860504439e-07,
|
|
"loss": 1.1748,
|
|
"mean_token_accuracy": 0.7142186999320984,
|
|
"num_tokens": 766618248.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.9037857045237119,
|
|
"grad_norm": 0.13579394866750716,
|
|
"learning_rate": 2.483443708609272e-07,
|
|
"loss": 1.1955,
|
|
"mean_token_accuracy": 0.7100520312786103,
|
|
"num_tokens": 767525077.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 1.148046875,
|
|
"epoch": 1.9051523848571819,
|
|
"grad_norm": 0.12337514373365811,
|
|
"learning_rate": 2.4482175567141046e-07,
|
|
"loss": 1.1552,
|
|
"mean_token_accuracy": 0.7191293656826019,
|
|
"num_tokens": 768457310.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 1.1953125,
|
|
"epoch": 1.9065190651906518,
|
|
"grad_norm": 0.10980290477643742,
|
|
"learning_rate": 2.4129914048189375e-07,
|
|
"loss": 1.1982,
|
|
"mean_token_accuracy": 0.7109768629074097,
|
|
"num_tokens": 769426821.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 1.19140625,
|
|
"epoch": 1.9078857455241218,
|
|
"grad_norm": 0.10964927963373734,
|
|
"learning_rate": 2.3777652529237708e-07,
|
|
"loss": 1.206,
|
|
"mean_token_accuracy": 0.7133993923664093,
|
|
"num_tokens": 770363833.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 1.196875,
|
|
"epoch": 1.9092524258575918,
|
|
"grad_norm": 0.14128235335653905,
|
|
"learning_rate": 2.342539101028604e-07,
|
|
"loss": 1.2129,
|
|
"mean_token_accuracy": 0.7099532127380371,
|
|
"num_tokens": 771329422.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 1.179296875,
|
|
"epoch": 1.9106191061910618,
|
|
"grad_norm": 0.18166590777382072,
|
|
"learning_rate": 2.307312949133437e-07,
|
|
"loss": 1.1933,
|
|
"mean_token_accuracy": 0.7115510821342468,
|
|
"num_tokens": 772239714.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 1.1578125,
|
|
"epoch": 1.9119857865245318,
|
|
"grad_norm": 0.12011012627186146,
|
|
"learning_rate": 2.2720867972382698e-07,
|
|
"loss": 1.1664,
|
|
"mean_token_accuracy": 0.7170770823955536,
|
|
"num_tokens": 773159765.0,
|
|
"step": 13990
|
|
},
|
|
{
"entropy": 1.195703125,
"epoch": 1.9133524668580018,
"grad_norm": 0.1097789561851833,
"learning_rate": 2.236860645343103e-07,
"loss": 1.188,
"mean_token_accuracy": 0.7140045464038849,
"num_tokens": 774055620.0,
"step": 14000
},
|
|
{
|
|
"entropy": 1.137890625,
|
|
"epoch": 1.914719147191472,
|
|
"grad_norm": 0.13290087074441512,
|
|
"learning_rate": 2.201634493447936e-07,
|
|
"loss": 1.1348,
|
|
"mean_token_accuracy": 0.7235208868980407,
|
|
"num_tokens": 774953572.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.916085827524942,
|
|
"grad_norm": 0.11495794350402981,
|
|
"learning_rate": 2.166408341552769e-07,
|
|
"loss": 1.1587,
|
|
"mean_token_accuracy": 0.717841362953186,
|
|
"num_tokens": 775855755.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.917452507858412,
|
|
"grad_norm": 0.10599846299185038,
|
|
"learning_rate": 2.131182189657602e-07,
|
|
"loss": 1.1639,
|
|
"mean_token_accuracy": 0.7185395061969757,
|
|
"num_tokens": 776759120.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 1.1609375,
|
|
"epoch": 1.918819188191882,
|
|
"grad_norm": 0.12123542688681803,
|
|
"learning_rate": 2.095956037762435e-07,
|
|
"loss": 1.1707,
|
|
"mean_token_accuracy": 0.7168347716331482,
|
|
"num_tokens": 777681575.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 1.1796875,
|
|
"epoch": 1.920185868525352,
|
|
"grad_norm": 0.11166122574484914,
|
|
"learning_rate": 2.0607298858672678e-07,
|
|
"loss": 1.1854,
|
|
"mean_token_accuracy": 0.7166872560977936,
|
|
"num_tokens": 778608066.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.9215525488588219,
|
|
"grad_norm": 0.12148902350026093,
|
|
"learning_rate": 2.025503733972101e-07,
|
|
"loss": 1.1716,
|
|
"mean_token_accuracy": 0.717884635925293,
|
|
"num_tokens": 779540511.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 1.22890625,
|
|
"epoch": 1.922919229192292,
|
|
"grad_norm": 0.12152148597659393,
|
|
"learning_rate": 1.990277582076934e-07,
|
|
"loss": 1.2376,
|
|
"mean_token_accuracy": 0.705209881067276,
|
|
"num_tokens": 780497748.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 1.14765625,
|
|
"epoch": 1.924285909525762,
|
|
"grad_norm": 0.12222977157980315,
|
|
"learning_rate": 1.955051430181767e-07,
|
|
"loss": 1.1358,
|
|
"mean_token_accuracy": 0.7211433470249176,
|
|
"num_tokens": 781361904.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 1.14375,
|
|
"epoch": 1.925652589859232,
|
|
"grad_norm": 0.1276031425106376,
|
|
"learning_rate": 1.9198252782866002e-07,
|
|
"loss": 1.1434,
|
|
"mean_token_accuracy": 0.7235897064208985,
|
|
"num_tokens": 782271999.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 1.16640625,
|
|
"epoch": 1.927019270192702,
|
|
"grad_norm": 0.11162788118138933,
|
|
"learning_rate": 1.8845991263914333e-07,
|
|
"loss": 1.1653,
|
|
"mean_token_accuracy": 0.7187773525714874,
|
|
"num_tokens": 783195601.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 1.19296875,
|
|
"epoch": 1.928385950526172,
|
|
"grad_norm": 0.12573632110345603,
|
|
"learning_rate": 1.8493729744962664e-07,
|
|
"loss": 1.1971,
|
|
"mean_token_accuracy": 0.7109823822975159,
|
|
"num_tokens": 784042465.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 1.2171875,
|
|
"epoch": 1.929752630859642,
|
|
"grad_norm": 0.11088993003931938,
|
|
"learning_rate": 1.8141468226010995e-07,
|
|
"loss": 1.2273,
|
|
"mean_token_accuracy": 0.7088818192481995,
|
|
"num_tokens": 784982291.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 1.151171875,
|
|
"epoch": 1.931119311193112,
|
|
"grad_norm": 0.12081343894361574,
|
|
"learning_rate": 1.778920670705932e-07,
|
|
"loss": 1.1478,
|
|
"mean_token_accuracy": 0.723282665014267,
|
|
"num_tokens": 785905912.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 1.1703125,
|
|
"epoch": 1.932485991526582,
|
|
"grad_norm": 0.1618816007216651,
|
|
"learning_rate": 1.743694518810765e-07,
|
|
"loss": 1.1722,
|
|
"mean_token_accuracy": 0.717379093170166,
|
|
"num_tokens": 786850843.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 1.15703125,
|
|
"epoch": 1.933852671860052,
|
|
"grad_norm": 0.17569509716845663,
|
|
"learning_rate": 1.7084683669155982e-07,
|
|
"loss": 1.1417,
|
|
"mean_token_accuracy": 0.7225930750370025,
|
|
"num_tokens": 787779785.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 1.1984375,
|
|
"epoch": 1.935219352193522,
|
|
"grad_norm": 0.13550911179628267,
|
|
"learning_rate": 1.6732422150204313e-07,
|
|
"loss": 1.1972,
|
|
"mean_token_accuracy": 0.7121036231517792,
|
|
"num_tokens": 788675803.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.936586032526992,
|
|
"grad_norm": 0.12256910971410273,
|
|
"learning_rate": 1.6380160631252644e-07,
|
|
"loss": 1.1696,
|
|
"mean_token_accuracy": 0.7182317495346069,
|
|
"num_tokens": 789644704.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 1.16015625,
|
|
"epoch": 1.937952712860462,
|
|
"grad_norm": 0.12444225078235649,
|
|
"learning_rate": 1.6027899112300975e-07,
|
|
"loss": 1.1525,
|
|
"mean_token_accuracy": 0.7200830578804016,
|
|
"num_tokens": 790554611.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 1.15234375,
|
|
"epoch": 1.9393193931939319,
|
|
"grad_norm": 0.1447352528548004,
|
|
"learning_rate": 1.5675637593349303e-07,
|
|
"loss": 1.1419,
|
|
"mean_token_accuracy": 0.7243234157562256,
|
|
"num_tokens": 791500398.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 1.171484375,
|
|
"epoch": 1.9406860735274019,
|
|
"grad_norm": 0.11937518482895305,
|
|
"learning_rate": 1.5323376074397634e-07,
|
|
"loss": 1.162,
|
|
"mean_token_accuracy": 0.7183200478553772,
|
|
"num_tokens": 792446160.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 1.23515625,
|
|
"epoch": 1.9420527538608718,
|
|
"grad_norm": 0.12935306722512405,
|
|
"learning_rate": 1.4971114555445965e-07,
|
|
"loss": 1.2463,
|
|
"mean_token_accuracy": 0.7011185467243195,
|
|
"num_tokens": 793381577.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 1.1875,
|
|
"epoch": 1.9434194341943418,
|
|
"grad_norm": 0.12006158358901137,
|
|
"learning_rate": 1.4618853036494293e-07,
|
|
"loss": 1.2086,
|
|
"mean_token_accuracy": 0.7086874008178711,
|
|
"num_tokens": 794280020.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.9447861145278118,
|
|
"grad_norm": 0.1309966012634864,
|
|
"learning_rate": 1.4266591517542624e-07,
|
|
"loss": 1.191,
|
|
"mean_token_accuracy": 0.7126041054725647,
|
|
"num_tokens": 795177200.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 1.159375,
|
|
"epoch": 1.9461527948612818,
|
|
"grad_norm": 0.11441065815351266,
|
|
"learning_rate": 1.3914329998590955e-07,
|
|
"loss": 1.1601,
|
|
"mean_token_accuracy": 0.7202122032642364,
|
|
"num_tokens": 796038174.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.947519475194752,
|
|
"grad_norm": 0.11654883075331667,
|
|
"learning_rate": 1.3562068479639286e-07,
|
|
"loss": 1.1649,
|
|
"mean_token_accuracy": 0.7191001892089843,
|
|
"num_tokens": 797001573.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 1.13515625,
|
|
"epoch": 1.948886155528222,
|
|
"grad_norm": 0.1224590226369271,
|
|
"learning_rate": 1.3209806960687614e-07,
|
|
"loss": 1.1371,
|
|
"mean_token_accuracy": 0.7235355019569397,
|
|
"num_tokens": 797884361.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 1.16796875,
|
|
"epoch": 1.950252835861692,
|
|
"grad_norm": 0.11713991637124449,
|
|
"learning_rate": 1.2857545441735945e-07,
|
|
"loss": 1.1748,
|
|
"mean_token_accuracy": 0.7169823348522186,
|
|
"num_tokens": 798793029.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 1.20078125,
|
|
"epoch": 1.951619516195162,
|
|
"grad_norm": 0.1213668873456502,
|
|
"learning_rate": 1.2505283922784276e-07,
|
|
"loss": 1.1977,
|
|
"mean_token_accuracy": 0.713214635848999,
|
|
"num_tokens": 799744420.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 1.146875,
|
|
"epoch": 1.952986196528632,
|
|
"grad_norm": 0.10943296820506572,
|
|
"learning_rate": 1.2153022403832607e-07,
|
|
"loss": 1.1619,
|
|
"mean_token_accuracy": 0.7174571275711059,
|
|
"num_tokens": 800678918.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 1.17734375,
|
|
"epoch": 1.954352876862102,
|
|
"grad_norm": 0.12329864386490005,
|
|
"learning_rate": 1.1800760884880937e-07,
|
|
"loss": 1.1854,
|
|
"mean_token_accuracy": 0.7137609601020813,
|
|
"num_tokens": 801642779.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 1.1640625,
|
|
"epoch": 1.9557195571955721,
|
|
"grad_norm": 0.31897841648749387,
|
|
"learning_rate": 1.1448499365929266e-07,
|
|
"loss": 1.1727,
|
|
"mean_token_accuracy": 0.7158215343952179,
|
|
"num_tokens": 802572358.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 1.18515625,
|
|
"epoch": 1.957086237529042,
|
|
"grad_norm": 0.12546218411650284,
|
|
"learning_rate": 1.1096237846977597e-07,
|
|
"loss": 1.1855,
|
|
"mean_token_accuracy": 0.714429748058319,
|
|
"num_tokens": 803489079.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 1.18984375,
|
|
"epoch": 1.958452917862512,
|
|
"grad_norm": 0.13712119277541082,
|
|
"learning_rate": 1.0743976328025928e-07,
|
|
"loss": 1.1794,
|
|
"mean_token_accuracy": 0.7133537650108337,
|
|
"num_tokens": 804423484.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 1.140625,
|
|
"epoch": 1.959819598195982,
|
|
"grad_norm": 0.12939287413458536,
|
|
"learning_rate": 1.0391714809074258e-07,
|
|
"loss": 1.1419,
|
|
"mean_token_accuracy": 0.721429044008255,
|
|
"num_tokens": 805356906.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 1.17109375,
|
|
"epoch": 1.961186278529452,
|
|
"grad_norm": 0.1829015570996705,
|
|
"learning_rate": 1.0039453290122588e-07,
|
|
"loss": 1.1801,
|
|
"mean_token_accuracy": 0.7133006095886231,
|
|
"num_tokens": 806266885.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 1.125,
|
|
"epoch": 1.962552958862922,
|
|
"grad_norm": 0.16055973581973068,
|
|
"learning_rate": 9.687191771170918e-08,
|
|
"loss": 1.1261,
|
|
"mean_token_accuracy": 0.7247130155563355,
|
|
"num_tokens": 807170234.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 1.19765625,
|
|
"epoch": 1.963919639196392,
|
|
"grad_norm": 0.12286905535278175,
|
|
"learning_rate": 9.334930252219248e-08,
|
|
"loss": 1.2075,
|
|
"mean_token_accuracy": 0.7116467356681824,
|
|
"num_tokens": 808131415.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 1.165625,
|
|
"epoch": 1.965286319529862,
|
|
"grad_norm": 0.11301653563611809,
|
|
"learning_rate": 8.982668733267578e-08,
|
|
"loss": 1.1732,
|
|
"mean_token_accuracy": 0.7166952252388,
|
|
"num_tokens": 809068797.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 1.966652999863332,
|
|
"grad_norm": 0.11734592398361644,
|
|
"learning_rate": 8.630407214315909e-08,
|
|
"loss": 1.1359,
|
|
"mean_token_accuracy": 0.7242200791835784,
|
|
"num_tokens": 809985873.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 1.21640625,
|
|
"epoch": 1.968019680196802,
|
|
"grad_norm": 0.12286464441217436,
|
|
"learning_rate": 8.27814569536424e-08,
|
|
"loss": 1.2198,
|
|
"mean_token_accuracy": 0.709374076128006,
|
|
"num_tokens": 810920966.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 1.18046875,
|
|
"epoch": 1.969386360530272,
|
|
"grad_norm": 0.1263412511417876,
|
|
"learning_rate": 7.925884176412568e-08,
|
|
"loss": 1.1946,
|
|
"mean_token_accuracy": 0.7119766235351562,
|
|
"num_tokens": 811822412.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 1.20625,
|
|
"epoch": 1.970753040863742,
|
|
"grad_norm": 0.12512236936812257,
|
|
"learning_rate": 7.573622657460899e-08,
|
|
"loss": 1.2171,
|
|
"mean_token_accuracy": 0.7085229814052582,
|
|
"num_tokens": 812735537.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.972119721197212,
|
|
"grad_norm": 0.11801226172172616,
|
|
"learning_rate": 7.22136113850923e-08,
|
|
"loss": 1.1307,
|
|
"mean_token_accuracy": 0.7251711249351501,
|
|
"num_tokens": 813662549.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 1.15078125,
|
|
"epoch": 1.9734864015306819,
|
|
"grad_norm": 0.11949815242339845,
|
|
"learning_rate": 6.869099619557561e-08,
|
|
"loss": 1.1528,
|
|
"mean_token_accuracy": 0.718987226486206,
|
|
"num_tokens": 814584828.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 1.1484375,
|
|
"epoch": 1.9748530818641519,
|
|
"grad_norm": 0.13943139179001762,
|
|
"learning_rate": 6.51683810060589e-08,
|
|
"loss": 1.1643,
|
|
"mean_token_accuracy": 0.7168914914131165,
|
|
"num_tokens": 815545404.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 1.1859375,
|
|
"epoch": 1.9762197621976219,
|
|
"grad_norm": 0.13018784102943434,
|
|
"learning_rate": 6.164576581654221e-08,
|
|
"loss": 1.1861,
|
|
"mean_token_accuracy": 0.7131885051727295,
|
|
"num_tokens": 816427771.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 1.15,
|
|
"epoch": 1.9775864425310918,
|
|
"grad_norm": 0.13801870144635667,
|
|
"learning_rate": 5.8123150627025515e-08,
|
|
"loss": 1.1468,
|
|
"mean_token_accuracy": 0.7215281903743744,
|
|
"num_tokens": 817322924.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 1.14609375,
|
|
"epoch": 1.9789531228645618,
|
|
"grad_norm": 0.11320591861256196,
|
|
"learning_rate": 5.460053543750881e-08,
|
|
"loss": 1.1536,
|
|
"mean_token_accuracy": 0.7212979793548584,
|
|
"num_tokens": 818179468.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 1.2015625,
|
|
"epoch": 1.980319803198032,
|
|
"grad_norm": 0.147948508038159,
|
|
"learning_rate": 5.107792024799211e-08,
|
|
"loss": 1.2162,
|
|
"mean_token_accuracy": 0.7081980645656586,
|
|
"num_tokens": 819098758.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 1.22109375,
|
|
"epoch": 1.981686483531502,
|
|
"grad_norm": 0.13557459432229862,
|
|
"learning_rate": 4.7555305058475415e-08,
|
|
"loss": 1.2304,
|
|
"mean_token_accuracy": 0.7063135087490082,
|
|
"num_tokens": 819982458.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 1.1765625,
|
|
"epoch": 1.983053163864972,
|
|
"grad_norm": 0.13915637517456245,
|
|
"learning_rate": 4.403268986895872e-08,
|
|
"loss": 1.2003,
|
|
"mean_token_accuracy": 0.7114025235176087,
|
|
"num_tokens": 820947496.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 1.19609375,
|
|
"epoch": 1.984419844198442,
|
|
"grad_norm": 0.12121060960666286,
|
|
"learning_rate": 4.0510074679442026e-08,
|
|
"loss": 1.2086,
|
|
"mean_token_accuracy": 0.7090893447399139,
|
|
"num_tokens": 821871008.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 1.16328125,
|
|
"epoch": 1.985786524531912,
|
|
"grad_norm": 0.12776407187546393,
|
|
"learning_rate": 3.698745948992532e-08,
|
|
"loss": 1.161,
|
|
"mean_token_accuracy": 0.7180455505847931,
|
|
"num_tokens": 822733922.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 1.13046875,
|
|
"epoch": 1.987153204865382,
|
|
"grad_norm": 0.11406187350132849,
|
|
"learning_rate": 3.3464844300408624e-08,
|
|
"loss": 1.1315,
|
|
"mean_token_accuracy": 0.7218894839286805,
|
|
"num_tokens": 823606493.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 1.1296875,
|
|
"epoch": 1.9885198851988521,
|
|
"grad_norm": 0.12086463470696587,
|
|
"learning_rate": 2.994222911089193e-08,
|
|
"loss": 1.1209,
|
|
"mean_token_accuracy": 0.7278454780578614,
|
|
"num_tokens": 824545058.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 1.18125,
|
|
"epoch": 1.9898865655323221,
|
|
"grad_norm": 0.12982098826262375,
|
|
"learning_rate": 2.6419613921375232e-08,
|
|
"loss": 1.1835,
|
|
"mean_token_accuracy": 0.716684204339981,
|
|
"num_tokens": 825494030.0,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"entropy": 1.13359375,
|
|
"epoch": 1.991253245865792,
|
|
"grad_norm": 0.1352682907219869,
|
|
"learning_rate": 2.289699873185853e-08,
|
|
"loss": 1.142,
|
|
"mean_token_accuracy": 0.7188388288021088,
|
|
"num_tokens": 826419109.0,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"entropy": 1.21796875,
|
|
"epoch": 1.992619926199262,
|
|
"grad_norm": 0.11550421977406476,
|
|
"learning_rate": 1.9374383542341837e-08,
|
|
"loss": 1.2192,
|
|
"mean_token_accuracy": 0.7060228884220123,
|
|
"num_tokens": 827302291.0,
|
|
"step": 14580
|
|
},
|
|
{
"entropy": 1.1375,
"epoch": 1.993986606532732,
"grad_norm": 0.11797804272113778,
"learning_rate": 1.585176835282514e-08,
"loss": 1.1262,
"mean_token_accuracy": 0.7259816229343414,
"num_tokens": 828222506.0,
"step": 14590
},
{
"entropy": 1.23125,
"epoch": 1.995353286866202,
"grad_norm": 0.1602147728422691,
"learning_rate": 1.2329153163308442e-08,
"loss": 1.2305,
"mean_token_accuracy": 0.705960875749588,
"num_tokens": 829164578.0,
"step": 14600
},
{
"entropy": 1.17890625,
"epoch": 1.996719967199672,
"grad_norm": 0.12009901245595929,
"learning_rate": 8.806537973791744e-09,
"loss": 1.1859,
"mean_token_accuracy": 0.7137560307979584,
"num_tokens": 830121770.0,
"step": 14610
},
{
"entropy": 1.16796875,
"epoch": 1.998086647533142,
"grad_norm": 0.12919863168758175,
"learning_rate": 5.2839227842750465e-09,
"loss": 1.1809,
"mean_token_accuracy": 0.7144694268703461,
"num_tokens": 831072613.0,
"step": 14620
},
{
"entropy": 1.18828125,
"epoch": 1.999453327866612,
"grad_norm": 0.12870458242456068,
"learning_rate": 1.7613075947583486e-09,
"loss": 1.184,
"mean_token_accuracy": 0.7157665252685547,
"num_tokens": 831998003.0,
"step": 14630
},
{
"entropy": 1.17578125,
"epoch": 2.0,
"mean_token_accuracy": 0.7135747969150543,
"num_tokens": 832377069.0,
"step": 14634,
"total_flos": 2.197196595737395e+16,
"train_loss": 0.7360902870387283,
"train_runtime": 33470.8922,
"train_samples_per_second": 55.96,
"train_steps_per_second": 0.437
}
],
"logging_steps": 10,
"max_steps": 14634,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.197196595737395e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}