Files
1.7b-MixtureVitae-web_curat…/trainer_state.json
ModelHub XC a04e4f2b55 初始化项目,由ModelHub XC社区提供模型
Model: ali-elganzory/1.7b-MixtureVitae-web_curated-100BT-longsft_16k-SFT-Tulu3-decontaminated
Source: Original Platform
2026-05-14 22:38:41 +08:00

14677 lines
409 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 14634,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.2921875,
"epoch": 0.0013666803334700013,
"grad_norm": 0.20590318438119218,
"learning_rate": 1.0227272727272728e-07,
"loss": 1.4251,
"mean_token_accuracy": 0.6811124742031097,
"num_tokens": 938571.0,
"step": 10
},
{
"entropy": 1.29140625,
"epoch": 0.0027333606669400026,
"grad_norm": 0.25638555133322033,
"learning_rate": 2.1590909090909094e-07,
"loss": 1.3921,
"mean_token_accuracy": 0.6900001168251038,
"num_tokens": 1829597.0,
"step": 20
},
{
"entropy": 1.2765625,
"epoch": 0.004100041000410004,
"grad_norm": 0.22350546146450515,
"learning_rate": 3.2954545454545455e-07,
"loss": 1.4012,
"mean_token_accuracy": 0.6867515265941619,
"num_tokens": 2756097.0,
"step": 30
},
{
"entropy": 1.31328125,
"epoch": 0.005466721333880005,
"grad_norm": 0.21591046526826038,
"learning_rate": 4.431818181818182e-07,
"loss": 1.422,
"mean_token_accuracy": 0.6827306509017944,
"num_tokens": 3667157.0,
"step": 40
},
{
"entropy": 1.265625,
"epoch": 0.006833401667350007,
"grad_norm": 0.2271614144743479,
"learning_rate": 5.568181818181818e-07,
"loss": 1.3766,
"mean_token_accuracy": 0.6905148029327393,
"num_tokens": 4593212.0,
"step": 50
},
{
"entropy": 1.3,
"epoch": 0.008200082000820008,
"grad_norm": 0.20413303667156643,
"learning_rate": 6.704545454545456e-07,
"loss": 1.3965,
"mean_token_accuracy": 0.6875518321990967,
"num_tokens": 5559539.0,
"step": 60
},
{
"entropy": 1.35390625,
"epoch": 0.00956676233429001,
"grad_norm": 0.20097705942757133,
"learning_rate": 7.840909090909092e-07,
"loss": 1.4669,
"mean_token_accuracy": 0.6763997912406922,
"num_tokens": 6518318.0,
"step": 70
},
{
"entropy": 1.35390625,
"epoch": 0.01093344266776001,
"grad_norm": 0.20311043983620206,
"learning_rate": 8.977272727272728e-07,
"loss": 1.4648,
"mean_token_accuracy": 0.6748988211154938,
"num_tokens": 7462414.0,
"step": 80
},
{
"entropy": 1.303125,
"epoch": 0.012300123001230012,
"grad_norm": 0.2025090696257044,
"learning_rate": 1.0113636363636365e-06,
"loss": 1.413,
"mean_token_accuracy": 0.6829967021942138,
"num_tokens": 8328147.0,
"step": 90
},
{
"entropy": 1.3015625,
"epoch": 0.013666803334700014,
"grad_norm": 0.2911125597291178,
"learning_rate": 1.125e-06,
"loss": 1.4101,
"mean_token_accuracy": 0.6864591300487518,
"num_tokens": 9256532.0,
"step": 100
},
{
"entropy": 1.296875,
"epoch": 0.015033483668170014,
"grad_norm": 0.15839872874711747,
"learning_rate": 1.2386363636363638e-06,
"loss": 1.4003,
"mean_token_accuracy": 0.6870242118835449,
"num_tokens": 10191350.0,
"step": 110
},
{
"entropy": 1.3015625,
"epoch": 0.016400164001640016,
"grad_norm": 0.14768293954653666,
"learning_rate": 1.3522727272727273e-06,
"loss": 1.4025,
"mean_token_accuracy": 0.6872741997241973,
"num_tokens": 11096376.0,
"step": 120
},
{
"entropy": 1.29609375,
"epoch": 0.017766844335110017,
"grad_norm": 0.15521963805813038,
"learning_rate": 1.465909090909091e-06,
"loss": 1.3871,
"mean_token_accuracy": 0.6885065555572509,
"num_tokens": 12025075.0,
"step": 130
},
{
"entropy": 1.33984375,
"epoch": 0.01913352466858002,
"grad_norm": 0.1384360265615404,
"learning_rate": 1.5795454545454547e-06,
"loss": 1.4255,
"mean_token_accuracy": 0.6830733776092529,
"num_tokens": 12961701.0,
"step": 140
},
{
"entropy": 1.3375,
"epoch": 0.02050020500205002,
"grad_norm": 0.13450464381913704,
"learning_rate": 1.6931818181818182e-06,
"loss": 1.4295,
"mean_token_accuracy": 0.681973659992218,
"num_tokens": 13905979.0,
"step": 150
},
{
"entropy": 1.315625,
"epoch": 0.02186688533552002,
"grad_norm": 0.13366037454259724,
"learning_rate": 1.8068181818181822e-06,
"loss": 1.4003,
"mean_token_accuracy": 0.6858269691467285,
"num_tokens": 14850397.0,
"step": 160
},
{
"entropy": 1.3546875,
"epoch": 0.023233565668990024,
"grad_norm": 0.13244530534858756,
"learning_rate": 1.9204545454545457e-06,
"loss": 1.438,
"mean_token_accuracy": 0.6813508450984955,
"num_tokens": 15703108.0,
"step": 170
},
{
"entropy": 1.26484375,
"epoch": 0.024600246002460024,
"grad_norm": 0.10492842459608154,
"learning_rate": 2.034090909090909e-06,
"loss": 1.3274,
"mean_token_accuracy": 0.6968121647834777,
"num_tokens": 16615464.0,
"step": 180
},
{
"entropy": 1.28125,
"epoch": 0.025966926335930025,
"grad_norm": 0.10565197200822471,
"learning_rate": 2.147727272727273e-06,
"loss": 1.3458,
"mean_token_accuracy": 0.6926308929920196,
"num_tokens": 17581557.0,
"step": 190
},
{
"entropy": 1.3421875,
"epoch": 0.02733360666940003,
"grad_norm": 0.1146572601766438,
"learning_rate": 2.2613636363636366e-06,
"loss": 1.4123,
"mean_token_accuracy": 0.6848664879798889,
"num_tokens": 18451335.0,
"step": 200
},
{
"entropy": 1.325,
"epoch": 0.02870028700287003,
"grad_norm": 0.11009319489475179,
"learning_rate": 2.375e-06,
"loss": 1.3865,
"mean_token_accuracy": 0.6851190984249115,
"num_tokens": 19392109.0,
"step": 210
},
{
"entropy": 1.3484375,
"epoch": 0.03006696733634003,
"grad_norm": 0.11577726283083871,
"learning_rate": 2.488636363636364e-06,
"loss": 1.3994,
"mean_token_accuracy": 0.6856881737709045,
"num_tokens": 20323700.0,
"step": 220
},
{
"entropy": 1.28671875,
"epoch": 0.03143364766981003,
"grad_norm": 0.09908738494564062,
"learning_rate": 2.6022727272727276e-06,
"loss": 1.3338,
"mean_token_accuracy": 0.6960058510303497,
"num_tokens": 21263745.0,
"step": 230
},
{
"entropy": 1.33984375,
"epoch": 0.03280032800328003,
"grad_norm": 0.10528018799457148,
"learning_rate": 2.715909090909091e-06,
"loss": 1.3739,
"mean_token_accuracy": 0.6884757697582244,
"num_tokens": 22195458.0,
"step": 240
},
{
"entropy": 1.33984375,
"epoch": 0.034167008336750036,
"grad_norm": 0.09176130404418655,
"learning_rate": 2.829545454545455e-06,
"loss": 1.3681,
"mean_token_accuracy": 0.6913634181022644,
"num_tokens": 23115090.0,
"step": 250
},
{
"entropy": 1.309375,
"epoch": 0.03553368867022003,
"grad_norm": 0.08898105428903243,
"learning_rate": 2.9431818181818185e-06,
"loss": 1.3408,
"mean_token_accuracy": 0.694323194026947,
"num_tokens": 24043510.0,
"step": 260
},
{
"entropy": 1.3078125,
"epoch": 0.03690036900369004,
"grad_norm": 0.09357245461841163,
"learning_rate": 3.056818181818182e-06,
"loss": 1.3319,
"mean_token_accuracy": 0.6955293536186218,
"num_tokens": 24975455.0,
"step": 270
},
{
"entropy": 1.31875,
"epoch": 0.03826704933716004,
"grad_norm": 0.08182500648680509,
"learning_rate": 3.1704545454545456e-06,
"loss": 1.3311,
"mean_token_accuracy": 0.695158714056015,
"num_tokens": 25915880.0,
"step": 280
},
{
"entropy": 1.33671875,
"epoch": 0.03963372967063004,
"grad_norm": 0.0811007885385848,
"learning_rate": 3.2840909090909095e-06,
"loss": 1.3444,
"mean_token_accuracy": 0.693206787109375,
"num_tokens": 26836649.0,
"step": 290
},
{
"entropy": 1.31796875,
"epoch": 0.04100041000410004,
"grad_norm": 0.084837358791642,
"learning_rate": 3.397727272727273e-06,
"loss": 1.3436,
"mean_token_accuracy": 0.694383192062378,
"num_tokens": 27758649.0,
"step": 300
},
{
"entropy": 1.35546875,
"epoch": 0.042367090337570044,
"grad_norm": 0.0832358502895584,
"learning_rate": 3.5113636363636365e-06,
"loss": 1.362,
"mean_token_accuracy": 0.6926927030086517,
"num_tokens": 28704182.0,
"step": 310
},
{
"entropy": 1.3109375,
"epoch": 0.04373377067104004,
"grad_norm": 0.0821744198002814,
"learning_rate": 3.625e-06,
"loss": 1.3198,
"mean_token_accuracy": 0.6963011741638183,
"num_tokens": 29605445.0,
"step": 320
},
{
"entropy": 1.40234375,
"epoch": 0.045100451004510045,
"grad_norm": 0.08348693863370858,
"learning_rate": 3.7386363636363635e-06,
"loss": 1.4108,
"mean_token_accuracy": 0.6820845186710358,
"num_tokens": 30515940.0,
"step": 330
},
{
"entropy": 1.2875,
"epoch": 0.04646713133798005,
"grad_norm": 0.07948410313995227,
"learning_rate": 3.852272727272728e-06,
"loss": 1.2947,
"mean_token_accuracy": 0.7019649386405945,
"num_tokens": 31408267.0,
"step": 340
},
{
"entropy": 1.26796875,
"epoch": 0.047833811671450045,
"grad_norm": 0.07849661690415582,
"learning_rate": 3.965909090909091e-06,
"loss": 1.2704,
"mean_token_accuracy": 0.7060164988040925,
"num_tokens": 32349688.0,
"step": 350
},
{
"entropy": 1.315625,
"epoch": 0.04920049200492005,
"grad_norm": 0.07019499497172645,
"learning_rate": 4.079545454545455e-06,
"loss": 1.3246,
"mean_token_accuracy": 0.6947045445442199,
"num_tokens": 33292692.0,
"step": 360
},
{
"entropy": 1.34609375,
"epoch": 0.05056717233839005,
"grad_norm": 0.08241900091369597,
"learning_rate": 4.193181818181819e-06,
"loss": 1.3504,
"mean_token_accuracy": 0.6920416057109833,
"num_tokens": 34223187.0,
"step": 370
},
{
"entropy": 1.36328125,
"epoch": 0.05193385267186005,
"grad_norm": 0.0821654564274366,
"learning_rate": 4.306818181818182e-06,
"loss": 1.3734,
"mean_token_accuracy": 0.6870876729488373,
"num_tokens": 35129123.0,
"step": 380
},
{
"entropy": 1.28515625,
"epoch": 0.05330053300533005,
"grad_norm": 0.0754978246998772,
"learning_rate": 4.420454545454546e-06,
"loss": 1.2933,
"mean_token_accuracy": 0.701804655790329,
"num_tokens": 36084818.0,
"step": 390
},
{
"entropy": 1.3046875,
"epoch": 0.05466721333880006,
"grad_norm": 0.082712540605289,
"learning_rate": 4.53409090909091e-06,
"loss": 1.2935,
"mean_token_accuracy": 0.7025778353214264,
"num_tokens": 36969815.0,
"step": 400
},
{
"entropy": 1.353125,
"epoch": 0.05603389367227005,
"grad_norm": 0.09042403617837913,
"learning_rate": 4.647727272727273e-06,
"loss": 1.3651,
"mean_token_accuracy": 0.6885217905044556,
"num_tokens": 37874200.0,
"step": 410
},
{
"entropy": 1.30078125,
"epoch": 0.05740057400574006,
"grad_norm": 0.07732729724622497,
"learning_rate": 4.761363636363637e-06,
"loss": 1.316,
"mean_token_accuracy": 0.698377650976181,
"num_tokens": 38814930.0,
"step": 420
},
{
"entropy": 1.27578125,
"epoch": 0.05876725433921006,
"grad_norm": 0.08147479872797253,
"learning_rate": 4.875e-06,
"loss": 1.2781,
"mean_token_accuracy": 0.7035331726074219,
"num_tokens": 39757293.0,
"step": 430
},
{
"entropy": 1.29765625,
"epoch": 0.06013393467268006,
"grad_norm": 0.07550887304060089,
"learning_rate": 4.988636363636364e-06,
"loss": 1.3073,
"mean_token_accuracy": 0.6974103450775146,
"num_tokens": 40713143.0,
"step": 440
},
{
"entropy": 1.34609375,
"epoch": 0.06150061500615006,
"grad_norm": 0.08031629788008746,
"learning_rate": 4.996829646329435e-06,
"loss": 1.3389,
"mean_token_accuracy": 0.692697161436081,
"num_tokens": 41638884.0,
"step": 450
},
{
"entropy": 1.3796875,
"epoch": 0.06286729533962006,
"grad_norm": 0.07728708031976339,
"learning_rate": 4.993307031139919e-06,
"loss": 1.3859,
"mean_token_accuracy": 0.6853989422321319,
"num_tokens": 42583453.0,
"step": 460
},
{
"entropy": 1.31953125,
"epoch": 0.06423397567309007,
"grad_norm": 0.12459835536382881,
"learning_rate": 4.989784415950402e-06,
"loss": 1.3214,
"mean_token_accuracy": 0.6952377259731293,
"num_tokens": 43522757.0,
"step": 470
},
{
"entropy": 1.3734375,
"epoch": 0.06560065600656007,
"grad_norm": 0.0812797588431013,
"learning_rate": 4.986261800760885e-06,
"loss": 1.3744,
"mean_token_accuracy": 0.6862836360931397,
"num_tokens": 44444147.0,
"step": 480
},
{
"entropy": 1.30078125,
"epoch": 0.06696733634003006,
"grad_norm": 0.07399811680686263,
"learning_rate": 4.9827391855713685e-06,
"loss": 1.2996,
"mean_token_accuracy": 0.701492989063263,
"num_tokens": 45400305.0,
"step": 490
},
{
"entropy": 1.3171875,
"epoch": 0.06833401667350007,
"grad_norm": 0.07820082156843089,
"learning_rate": 4.979216570381852e-06,
"loss": 1.3057,
"mean_token_accuracy": 0.6980636060237885,
"num_tokens": 46325614.0,
"step": 500
},
{
"entropy": 1.328125,
"epoch": 0.06970069700697007,
"grad_norm": 0.075793107283648,
"learning_rate": 4.975693955192335e-06,
"loss": 1.3369,
"mean_token_accuracy": 0.6917027294635772,
"num_tokens": 47238170.0,
"step": 510
},
{
"entropy": 1.340625,
"epoch": 0.07106737734044007,
"grad_norm": 0.07932616077808104,
"learning_rate": 4.972171340002819e-06,
"loss": 1.3447,
"mean_token_accuracy": 0.6915920794010162,
"num_tokens": 48194911.0,
"step": 520
},
{
"entropy": 1.3515625,
"epoch": 0.07243405767391008,
"grad_norm": 0.08398975723628085,
"learning_rate": 4.968648724813302e-06,
"loss": 1.3533,
"mean_token_accuracy": 0.6909829080104828,
"num_tokens": 49112005.0,
"step": 530
},
{
"entropy": 1.31640625,
"epoch": 0.07380073800738007,
"grad_norm": 0.07989260382049766,
"learning_rate": 4.965126109623785e-06,
"loss": 1.3218,
"mean_token_accuracy": 0.69542036652565,
"num_tokens": 50015461.0,
"step": 540
},
{
"entropy": 1.33203125,
"epoch": 0.07516741834085007,
"grad_norm": 0.07408836115558076,
"learning_rate": 4.961603494434268e-06,
"loss": 1.3411,
"mean_token_accuracy": 0.6931326866149903,
"num_tokens": 50935643.0,
"step": 550
},
{
"entropy": 1.30703125,
"epoch": 0.07653409867432008,
"grad_norm": 0.07858752296753967,
"learning_rate": 4.958080879244752e-06,
"loss": 1.3148,
"mean_token_accuracy": 0.6965982973575592,
"num_tokens": 51878002.0,
"step": 560
},
{
"entropy": 1.290625,
"epoch": 0.07790077900779008,
"grad_norm": 0.0905676198109956,
"learning_rate": 4.954558264055234e-06,
"loss": 1.3011,
"mean_token_accuracy": 0.6977157056331634,
"num_tokens": 52758343.0,
"step": 570
},
{
"entropy": 1.28671875,
"epoch": 0.07926745934126007,
"grad_norm": 0.1004741429606398,
"learning_rate": 4.951035648865719e-06,
"loss": 1.2789,
"mean_token_accuracy": 0.7038198471069336,
"num_tokens": 53650442.0,
"step": 580
},
{
"entropy": 1.28984375,
"epoch": 0.08063413967473008,
"grad_norm": 0.0993156910857053,
"learning_rate": 4.9475130336762015e-06,
"loss": 1.2963,
"mean_token_accuracy": 0.6983868420124054,
"num_tokens": 54555977.0,
"step": 590
},
{
"entropy": 1.32421875,
"epoch": 0.08200082000820008,
"grad_norm": 0.08117674628249906,
"learning_rate": 4.943990418486685e-06,
"loss": 1.3193,
"mean_token_accuracy": 0.6938058018684388,
"num_tokens": 55522236.0,
"step": 600
},
{
"entropy": 1.290625,
"epoch": 0.08336750034167008,
"grad_norm": 0.07836203547457134,
"learning_rate": 4.9404678032971685e-06,
"loss": 1.2911,
"mean_token_accuracy": 0.7005116820335389,
"num_tokens": 56427197.0,
"step": 610
},
{
"entropy": 1.28828125,
"epoch": 0.08473418067514009,
"grad_norm": 0.08490088093246893,
"learning_rate": 4.936945188107651e-06,
"loss": 1.3012,
"mean_token_accuracy": 0.7000757873058319,
"num_tokens": 57362921.0,
"step": 620
},
{
"entropy": 1.24609375,
"epoch": 0.08610086100861009,
"grad_norm": 0.07897937015461103,
"learning_rate": 4.933422572918135e-06,
"loss": 1.2433,
"mean_token_accuracy": 0.7085789263248443,
"num_tokens": 58256680.0,
"step": 630
},
{
"entropy": 1.2890625,
"epoch": 0.08746754134208008,
"grad_norm": 0.07810078348428369,
"learning_rate": 4.929899957728618e-06,
"loss": 1.2813,
"mean_token_accuracy": 0.7032000303268433,
"num_tokens": 59193508.0,
"step": 640
},
{
"entropy": 1.2890625,
"epoch": 0.08883422167555009,
"grad_norm": 0.09096412375798649,
"learning_rate": 4.926377342539102e-06,
"loss": 1.2773,
"mean_token_accuracy": 0.7024648666381836,
"num_tokens": 60108420.0,
"step": 650
},
{
"entropy": 1.2640625,
"epoch": 0.09020090200902009,
"grad_norm": 0.08583016300549597,
"learning_rate": 4.922854727349585e-06,
"loss": 1.2534,
"mean_token_accuracy": 0.7075643420219422,
"num_tokens": 61034298.0,
"step": 660
},
{
"entropy": 1.315625,
"epoch": 0.09156758234249009,
"grad_norm": 0.08777853715343714,
"learning_rate": 4.919332112160068e-06,
"loss": 1.3111,
"mean_token_accuracy": 0.6977550685405731,
"num_tokens": 61983537.0,
"step": 670
},
{
"entropy": 1.26640625,
"epoch": 0.0929342626759601,
"grad_norm": 0.09021670026999723,
"learning_rate": 4.915809496970551e-06,
"loss": 1.2674,
"mean_token_accuracy": 0.7047162652015686,
"num_tokens": 62935993.0,
"step": 680
},
{
"entropy": 1.35625,
"epoch": 0.0943009430094301,
"grad_norm": 0.08222603116668331,
"learning_rate": 4.912286881781035e-06,
"loss": 1.3712,
"mean_token_accuracy": 0.68455491065979,
"num_tokens": 63853924.0,
"step": 690
},
{
"entropy": 1.278125,
"epoch": 0.09566762334290009,
"grad_norm": 0.08265910517567637,
"learning_rate": 4.908764266591518e-06,
"loss": 1.2776,
"mean_token_accuracy": 0.70172478556633,
"num_tokens": 64763931.0,
"step": 700
},
{
"entropy": 1.30703125,
"epoch": 0.0970343036763701,
"grad_norm": 0.08089115067606378,
"learning_rate": 4.9052416514020015e-06,
"loss": 1.3,
"mean_token_accuracy": 0.6979008257389069,
"num_tokens": 65673140.0,
"step": 710
},
{
"entropy": 1.2515625,
"epoch": 0.0984009840098401,
"grad_norm": 0.090015043502394,
"learning_rate": 4.901719036212484e-06,
"loss": 1.2576,
"mean_token_accuracy": 0.70569606423378,
"num_tokens": 66640391.0,
"step": 720
},
{
"entropy": 1.27734375,
"epoch": 0.0997676643433101,
"grad_norm": 0.08060833385035761,
"learning_rate": 4.898196421022968e-06,
"loss": 1.2811,
"mean_token_accuracy": 0.7026139676570893,
"num_tokens": 67563282.0,
"step": 730
},
{
"entropy": 1.2515625,
"epoch": 0.1011343446767801,
"grad_norm": 0.09202776253304187,
"learning_rate": 4.894673805833451e-06,
"loss": 1.2331,
"mean_token_accuracy": 0.7098193883895874,
"num_tokens": 68416165.0,
"step": 740
},
{
"entropy": 1.278125,
"epoch": 0.1025010250102501,
"grad_norm": 0.07431162863468342,
"learning_rate": 4.891151190643935e-06,
"loss": 1.2748,
"mean_token_accuracy": 0.7037934124469757,
"num_tokens": 69336094.0,
"step": 750
},
{
"entropy": 1.2875,
"epoch": 0.1038677053437201,
"grad_norm": 0.25992690555253783,
"learning_rate": 4.8876285754544175e-06,
"loss": 1.2835,
"mean_token_accuracy": 0.7011581182479858,
"num_tokens": 70292744.0,
"step": 760
},
{
"entropy": 1.33046875,
"epoch": 0.10523438567719011,
"grad_norm": 0.09898828207879566,
"learning_rate": 4.884105960264901e-06,
"loss": 1.3327,
"mean_token_accuracy": 0.6926266431808472,
"num_tokens": 71192370.0,
"step": 770
},
{
"entropy": 1.28984375,
"epoch": 0.1066010660106601,
"grad_norm": 0.1784777015998728,
"learning_rate": 4.880583345075385e-06,
"loss": 1.2788,
"mean_token_accuracy": 0.7014510631561279,
"num_tokens": 72109962.0,
"step": 780
},
{
"entropy": 1.24765625,
"epoch": 0.1079677463441301,
"grad_norm": 0.07729748442042797,
"learning_rate": 4.877060729885867e-06,
"loss": 1.2497,
"mean_token_accuracy": 0.7058468222618103,
"num_tokens": 73022537.0,
"step": 790
},
{
"entropy": 1.2703125,
"epoch": 0.10933442667760011,
"grad_norm": 0.10472877541823362,
"learning_rate": 4.873538114696351e-06,
"loss": 1.2664,
"mean_token_accuracy": 0.7035557746887207,
"num_tokens": 73932084.0,
"step": 800
},
{
"entropy": 1.303125,
"epoch": 0.11070110701107011,
"grad_norm": 0.08514918195156679,
"learning_rate": 4.870015499506834e-06,
"loss": 1.301,
"mean_token_accuracy": 0.6995691657066345,
"num_tokens": 74843608.0,
"step": 810
},
{
"entropy": 1.27421875,
"epoch": 0.1120677873445401,
"grad_norm": 0.07821035598014474,
"learning_rate": 4.866492884317318e-06,
"loss": 1.2682,
"mean_token_accuracy": 0.703374195098877,
"num_tokens": 75769415.0,
"step": 820
},
{
"entropy": 1.24453125,
"epoch": 0.11343446767801012,
"grad_norm": 0.08250912928098288,
"learning_rate": 4.862970269127801e-06,
"loss": 1.248,
"mean_token_accuracy": 0.7085954666137695,
"num_tokens": 76676625.0,
"step": 830
},
{
"entropy": 1.27421875,
"epoch": 0.11480114801148011,
"grad_norm": 0.0838663692665798,
"learning_rate": 4.859447653938284e-06,
"loss": 1.2674,
"mean_token_accuracy": 0.7029982686042786,
"num_tokens": 77575710.0,
"step": 840
},
{
"entropy": 1.26796875,
"epoch": 0.11616782834495011,
"grad_norm": 0.07704236012712347,
"learning_rate": 4.855925038748768e-06,
"loss": 1.2724,
"mean_token_accuracy": 0.7048835754394531,
"num_tokens": 78573992.0,
"step": 850
},
{
"entropy": 1.28984375,
"epoch": 0.11753450867842012,
"grad_norm": 0.07611479337857986,
"learning_rate": 4.852402423559251e-06,
"loss": 1.2827,
"mean_token_accuracy": 0.7024235785007477,
"num_tokens": 79532329.0,
"step": 860
},
{
"entropy": 1.31015625,
"epoch": 0.11890118901189012,
"grad_norm": 0.07867841791087987,
"learning_rate": 4.848879808369734e-06,
"loss": 1.3179,
"mean_token_accuracy": 0.6961228549480438,
"num_tokens": 80438221.0,
"step": 870
},
{
"entropy": 1.27109375,
"epoch": 0.12026786934536011,
"grad_norm": 0.08028415419715763,
"learning_rate": 4.8453571931802175e-06,
"loss": 1.254,
"mean_token_accuracy": 0.7076728880405426,
"num_tokens": 81376762.0,
"step": 880
},
{
"entropy": 1.22265625,
"epoch": 0.12163454967883013,
"grad_norm": 0.07794372655102948,
"learning_rate": 4.8418345779907e-06,
"loss": 1.2225,
"mean_token_accuracy": 0.7126479685306549,
"num_tokens": 82272468.0,
"step": 890
},
{
"entropy": 1.315625,
"epoch": 0.12300123001230012,
"grad_norm": 0.0889859089851918,
"learning_rate": 4.838311962801184e-06,
"loss": 1.3196,
"mean_token_accuracy": 0.6967448174953461,
"num_tokens": 83215846.0,
"step": 900
},
{
"entropy": 1.2703125,
"epoch": 0.12436791034577012,
"grad_norm": 0.08295902831447972,
"learning_rate": 4.834789347611667e-06,
"loss": 1.2697,
"mean_token_accuracy": 0.7040178775787354,
"num_tokens": 84093524.0,
"step": 910
},
{
"entropy": 1.27734375,
"epoch": 0.12573459067924012,
"grad_norm": 0.09943770868099991,
"learning_rate": 4.831266732422151e-06,
"loss": 1.2631,
"mean_token_accuracy": 0.7054253041744232,
"num_tokens": 84978921.0,
"step": 920
},
{
"entropy": 1.22421875,
"epoch": 0.12710127101271013,
"grad_norm": 0.08786232646713033,
"learning_rate": 4.827744117232634e-06,
"loss": 1.212,
"mean_token_accuracy": 0.7150096893310547,
"num_tokens": 85868292.0,
"step": 930
},
{
"entropy": 1.271875,
"epoch": 0.12846795134618014,
"grad_norm": 0.0946290796869984,
"learning_rate": 4.824221502043117e-06,
"loss": 1.2569,
"mean_token_accuracy": 0.7059077620506287,
"num_tokens": 86759570.0,
"step": 940
},
{
"entropy": 1.27734375,
"epoch": 0.12983463167965012,
"grad_norm": 0.07423077151379168,
"learning_rate": 4.820698886853601e-06,
"loss": 1.273,
"mean_token_accuracy": 0.701602292060852,
"num_tokens": 87742990.0,
"step": 950
},
{
"entropy": 1.29609375,
"epoch": 0.13120131201312013,
"grad_norm": 0.08156480466922142,
"learning_rate": 4.817176271664084e-06,
"loss": 1.2883,
"mean_token_accuracy": 0.6999460458755493,
"num_tokens": 88685843.0,
"step": 960
},
{
"entropy": 1.2515625,
"epoch": 0.13256799234659014,
"grad_norm": 0.08027121994486086,
"learning_rate": 4.813653656474567e-06,
"loss": 1.2422,
"mean_token_accuracy": 0.708583551645279,
"num_tokens": 89560923.0,
"step": 970
},
{
"entropy": 1.2765625,
"epoch": 0.13393467268006012,
"grad_norm": 0.09762049590178451,
"learning_rate": 4.8101310412850505e-06,
"loss": 1.2822,
"mean_token_accuracy": 0.7005646705627442,
"num_tokens": 90477434.0,
"step": 980
},
{
"entropy": 1.2296875,
"epoch": 0.13530135301353013,
"grad_norm": 0.11906452540402349,
"learning_rate": 4.806608426095534e-06,
"loss": 1.2358,
"mean_token_accuracy": 0.7110221087932587,
"num_tokens": 91445593.0,
"step": 990
},
{
"entropy": 1.221875,
"epoch": 0.13666803334700015,
"grad_norm": 0.06983820711604101,
"learning_rate": 4.803085810906017e-06,
"loss": 1.2066,
"mean_token_accuracy": 0.7144997894763947,
"num_tokens": 92372246.0,
"step": 1000
},
{
"entropy": 1.31484375,
"epoch": 0.13803471368047013,
"grad_norm": 0.07347743593137977,
"learning_rate": 4.7995631957165e-06,
"loss": 1.3129,
"mean_token_accuracy": 0.6944493293762207,
"num_tokens": 93322975.0,
"step": 1010
},
{
"entropy": 1.2234375,
"epoch": 0.13940139401394014,
"grad_norm": 0.08749704971685424,
"learning_rate": 4.796040580526984e-06,
"loss": 1.2244,
"mean_token_accuracy": 0.711865234375,
"num_tokens": 94243005.0,
"step": 1020
},
{
"entropy": 1.25625,
"epoch": 0.14076807434741015,
"grad_norm": 0.11984689309964754,
"learning_rate": 4.792517965337467e-06,
"loss": 1.2566,
"mean_token_accuracy": 0.7058712303638458,
"num_tokens": 95128367.0,
"step": 1030
},
{
"entropy": 1.25078125,
"epoch": 0.14213475468088013,
"grad_norm": 0.08869552963159176,
"learning_rate": 4.78899535014795e-06,
"loss": 1.2535,
"mean_token_accuracy": 0.7077961444854737,
"num_tokens": 96052977.0,
"step": 1040
},
{
"entropy": 1.253125,
"epoch": 0.14350143501435014,
"grad_norm": 0.07145596936160792,
"learning_rate": 4.785472734958434e-06,
"loss": 1.2468,
"mean_token_accuracy": 0.7078384280204773,
"num_tokens": 96997179.0,
"step": 1050
},
{
"entropy": 1.265625,
"epoch": 0.14486811534782015,
"grad_norm": 0.07577902651029755,
"learning_rate": 4.781950119768916e-06,
"loss": 1.2622,
"mean_token_accuracy": 0.705233120918274,
"num_tokens": 97934565.0,
"step": 1060
},
{
"entropy": 1.2390625,
"epoch": 0.14623479568129014,
"grad_norm": 0.0768825680217554,
"learning_rate": 4.778427504579401e-06,
"loss": 1.2152,
"mean_token_accuracy": 0.7137013077735901,
"num_tokens": 98864499.0,
"step": 1070
},
{
"entropy": 1.20234375,
"epoch": 0.14760147601476015,
"grad_norm": 0.06897826948850266,
"learning_rate": 4.774904889389883e-06,
"loss": 1.2001,
"mean_token_accuracy": 0.7159342586994171,
"num_tokens": 99822130.0,
"step": 1080
},
{
"entropy": 1.2421875,
"epoch": 0.14896815634823016,
"grad_norm": 0.0808165613245565,
"learning_rate": 4.771382274200367e-06,
"loss": 1.2456,
"mean_token_accuracy": 0.7091240406036377,
"num_tokens": 100744706.0,
"step": 1090
},
{
"entropy": 1.2796875,
"epoch": 0.15033483668170014,
"grad_norm": 0.07184351544770547,
"learning_rate": 4.76785965901085e-06,
"loss": 1.2739,
"mean_token_accuracy": 0.703175950050354,
"num_tokens": 101705020.0,
"step": 1100
},
{
"entropy": 1.315625,
"epoch": 0.15170151701517015,
"grad_norm": 0.0807302551860253,
"learning_rate": 4.764337043821333e-06,
"loss": 1.321,
"mean_token_accuracy": 0.694986367225647,
"num_tokens": 102616456.0,
"step": 1110
},
{
"entropy": 1.28359375,
"epoch": 0.15306819734864016,
"grad_norm": 0.07967853764185674,
"learning_rate": 4.760814428631817e-06,
"loss": 1.2683,
"mean_token_accuracy": 0.7049009442329407,
"num_tokens": 103534288.0,
"step": 1120
},
{
"entropy": 1.27421875,
"epoch": 0.15443487768211014,
"grad_norm": 0.07903675206823298,
"learning_rate": 4.7572918134423e-06,
"loss": 1.2628,
"mean_token_accuracy": 0.703869616985321,
"num_tokens": 104412112.0,
"step": 1130
},
{
"entropy": 1.19296875,
"epoch": 0.15580155801558015,
"grad_norm": 0.06464633726333277,
"learning_rate": 4.753769198252783e-06,
"loss": 1.1988,
"mean_token_accuracy": 0.7166968941688537,
"num_tokens": 105339912.0,
"step": 1140
},
{
"entropy": 1.31953125,
"epoch": 0.15716823834905017,
"grad_norm": 0.0862678868585903,
"learning_rate": 4.7502465830632665e-06,
"loss": 1.321,
"mean_token_accuracy": 0.6942552208900452,
"num_tokens": 106300822.0,
"step": 1150
},
{
"entropy": 1.2109375,
"epoch": 0.15853491868252015,
"grad_norm": 0.07035429331453753,
"learning_rate": 4.74672396787375e-06,
"loss": 1.2049,
"mean_token_accuracy": 0.7169293820858001,
"num_tokens": 107234943.0,
"step": 1160
},
{
"entropy": 1.23125,
"epoch": 0.15990159901599016,
"grad_norm": 0.11494137627402178,
"learning_rate": 4.743201352684233e-06,
"loss": 1.2238,
"mean_token_accuracy": 0.7129312455654144,
"num_tokens": 108169288.0,
"step": 1170
},
{
"entropy": 1.2625,
"epoch": 0.16126827934946017,
"grad_norm": 0.08025352767532193,
"learning_rate": 4.739678737494716e-06,
"loss": 1.2634,
"mean_token_accuracy": 0.7080941021442413,
"num_tokens": 109066252.0,
"step": 1180
},
{
"entropy": 1.265625,
"epoch": 0.16263495968293015,
"grad_norm": 0.07040871182298959,
"learning_rate": 4.7361561223052e-06,
"loss": 1.2552,
"mean_token_accuracy": 0.7062826573848724,
"num_tokens": 109981241.0,
"step": 1190
},
{
"entropy": 1.23125,
"epoch": 0.16400164001640016,
"grad_norm": 0.09807538344809065,
"learning_rate": 4.7326335071156834e-06,
"loss": 1.2228,
"mean_token_accuracy": 0.7126657068729401,
"num_tokens": 110887025.0,
"step": 1200
},
{
"entropy": 1.2609375,
"epoch": 0.16536832034987017,
"grad_norm": 0.07009183622789018,
"learning_rate": 4.729110891926166e-06,
"loss": 1.26,
"mean_token_accuracy": 0.7070395708084106,
"num_tokens": 111835136.0,
"step": 1210
},
{
"entropy": 1.2234375,
"epoch": 0.16673500068334016,
"grad_norm": 0.08713784906007616,
"learning_rate": 4.72558827673665e-06,
"loss": 1.2312,
"mean_token_accuracy": 0.7117593646049499,
"num_tokens": 112715049.0,
"step": 1220
},
{
"entropy": 1.2390625,
"epoch": 0.16810168101681017,
"grad_norm": 0.07459950732581268,
"learning_rate": 4.722065661547132e-06,
"loss": 1.2511,
"mean_token_accuracy": 0.7085782706737518,
"num_tokens": 113645212.0,
"step": 1230
},
{
"entropy": 1.2703125,
"epoch": 0.16946836135028018,
"grad_norm": 0.08237967619133317,
"learning_rate": 4.718543046357617e-06,
"loss": 1.2594,
"mean_token_accuracy": 0.7056242167949677,
"num_tokens": 114607757.0,
"step": 1240
},
{
"entropy": 1.20703125,
"epoch": 0.17083504168375016,
"grad_norm": 0.09707333014201057,
"learning_rate": 4.7150204311680995e-06,
"loss": 1.2059,
"mean_token_accuracy": 0.715604817867279,
"num_tokens": 115481536.0,
"step": 1250
},
{
"entropy": 1.27421875,
"epoch": 0.17220172201722017,
"grad_norm": 0.07888693046270022,
"learning_rate": 4.711497815978583e-06,
"loss": 1.2614,
"mean_token_accuracy": 0.7047868728637695,
"num_tokens": 116398355.0,
"step": 1260
},
{
"entropy": 1.27421875,
"epoch": 0.17356840235069018,
"grad_norm": 0.07006294463577548,
"learning_rate": 4.707975200789066e-06,
"loss": 1.2822,
"mean_token_accuracy": 0.7012380719184875,
"num_tokens": 117363895.0,
"step": 1270
},
{
"entropy": 1.2484375,
"epoch": 0.17493508268416016,
"grad_norm": 0.07163646650797828,
"learning_rate": 4.704452585599549e-06,
"loss": 1.2556,
"mean_token_accuracy": 0.7058372139930725,
"num_tokens": 118281157.0,
"step": 1280
},
{
"entropy": 1.23984375,
"epoch": 0.17630176301763018,
"grad_norm": 0.08527897128568629,
"learning_rate": 4.700929970410033e-06,
"loss": 1.2427,
"mean_token_accuracy": 0.7098132967948914,
"num_tokens": 119199903.0,
"step": 1290
},
{
"entropy": 1.22734375,
"epoch": 0.17766844335110019,
"grad_norm": 0.0747137498330954,
"learning_rate": 4.697407355220516e-06,
"loss": 1.2154,
"mean_token_accuracy": 0.714475291967392,
"num_tokens": 120153384.0,
"step": 1300
},
{
"entropy": 1.234375,
"epoch": 0.17903512368457017,
"grad_norm": 0.07684885387786991,
"learning_rate": 4.693884740030999e-06,
"loss": 1.2186,
"mean_token_accuracy": 0.7112382173538208,
"num_tokens": 121028690.0,
"step": 1310
},
{
"entropy": 1.2625,
"epoch": 0.18040180401804018,
"grad_norm": 0.07874506739073388,
"learning_rate": 4.690362124841483e-06,
"loss": 1.2569,
"mean_token_accuracy": 0.707000607252121,
"num_tokens": 121950665.0,
"step": 1320
},
{
"entropy": 1.24296875,
"epoch": 0.1817684843515102,
"grad_norm": 0.06978374549447125,
"learning_rate": 4.686839509651966e-06,
"loss": 1.2446,
"mean_token_accuracy": 0.7068430840969085,
"num_tokens": 122875209.0,
"step": 1330
},
{
"entropy": 1.2546875,
"epoch": 0.18313516468498017,
"grad_norm": 0.08087799276814463,
"learning_rate": 4.683316894462449e-06,
"loss": 1.2383,
"mean_token_accuracy": 0.7099708557128906,
"num_tokens": 123817600.0,
"step": 1340
},
{
"entropy": 1.228125,
"epoch": 0.18450184501845018,
"grad_norm": 0.07926575865721339,
"learning_rate": 4.679794279272933e-06,
"loss": 1.2423,
"mean_token_accuracy": 0.7086865067481994,
"num_tokens": 124741536.0,
"step": 1350
},
{
"entropy": 1.2515625,
"epoch": 0.1858685253519202,
"grad_norm": 0.07645397250497793,
"learning_rate": 4.676271664083416e-06,
"loss": 1.2547,
"mean_token_accuracy": 0.7065807282924652,
"num_tokens": 125689703.0,
"step": 1360
},
{
"entropy": 1.29921875,
"epoch": 0.18723520568539018,
"grad_norm": 0.10883191872571155,
"learning_rate": 4.6727490488938995e-06,
"loss": 1.3019,
"mean_token_accuracy": 0.6984774529933929,
"num_tokens": 126565228.0,
"step": 1370
},
{
"entropy": 1.24375,
"epoch": 0.1886018860188602,
"grad_norm": 0.0662168165896254,
"learning_rate": 4.669226433704382e-06,
"loss": 1.238,
"mean_token_accuracy": 0.7095187544822693,
"num_tokens": 127484119.0,
"step": 1380
},
{
"entropy": 1.2171875,
"epoch": 0.1899685663523302,
"grad_norm": 0.07364321880963538,
"learning_rate": 4.665703818514866e-06,
"loss": 1.2119,
"mean_token_accuracy": 0.7168352484703064,
"num_tokens": 128382056.0,
"step": 1390
},
{
"entropy": 1.23828125,
"epoch": 0.19133524668580018,
"grad_norm": 0.06856405306846797,
"learning_rate": 4.6621812033253484e-06,
"loss": 1.2363,
"mean_token_accuracy": 0.7104978263378143,
"num_tokens": 129302334.0,
"step": 1400
},
{
"entropy": 1.15078125,
"epoch": 0.1927019270192702,
"grad_norm": 0.07910644437082089,
"learning_rate": 4.658658588135833e-06,
"loss": 1.1408,
"mean_token_accuracy": 0.7266946077346802,
"num_tokens": 130215453.0,
"step": 1410
},
{
"entropy": 1.1953125,
"epoch": 0.1940686073527402,
"grad_norm": 0.07454391462475163,
"learning_rate": 4.6551359729463155e-06,
"loss": 1.1812,
"mean_token_accuracy": 0.7201046526432038,
"num_tokens": 131191399.0,
"step": 1420
},
{
"entropy": 1.26328125,
"epoch": 0.19543528768621019,
"grad_norm": 0.07633654228468086,
"learning_rate": 4.651613357756799e-06,
"loss": 1.2768,
"mean_token_accuracy": 0.7004533767700195,
"num_tokens": 132122523.0,
"step": 1430
},
{
"entropy": 1.26953125,
"epoch": 0.1968019680196802,
"grad_norm": 0.07928634337278012,
"learning_rate": 4.648090742567283e-06,
"loss": 1.2811,
"mean_token_accuracy": 0.7026191055774689,
"num_tokens": 133055902.0,
"step": 1440
},
{
"entropy": 1.23671875,
"epoch": 0.1981686483531502,
"grad_norm": 0.07755302820647716,
"learning_rate": 4.644568127377765e-06,
"loss": 1.238,
"mean_token_accuracy": 0.7086126446723938,
"num_tokens": 134007365.0,
"step": 1450
},
{
"entropy": 1.23359375,
"epoch": 0.1995353286866202,
"grad_norm": 0.07548950749558703,
"learning_rate": 4.641045512188249e-06,
"loss": 1.2368,
"mean_token_accuracy": 0.7100744724273682,
"num_tokens": 134939937.0,
"step": 1460
},
{
"entropy": 1.2,
"epoch": 0.2009020090200902,
"grad_norm": 0.07736508227865796,
"learning_rate": 4.6375228969987324e-06,
"loss": 1.1992,
"mean_token_accuracy": 0.7154443264007568,
"num_tokens": 135883157.0,
"step": 1470
},
{
"entropy": 1.2421875,
"epoch": 0.2022686893535602,
"grad_norm": 0.07664901875187753,
"learning_rate": 4.634000281809216e-06,
"loss": 1.2321,
"mean_token_accuracy": 0.7113223552703858,
"num_tokens": 136826502.0,
"step": 1480
},
{
"entropy": 1.2140625,
"epoch": 0.2036353696870302,
"grad_norm": 0.0939828014936613,
"learning_rate": 4.630477666619699e-06,
"loss": 1.205,
"mean_token_accuracy": 0.7153442561626434,
"num_tokens": 137729172.0,
"step": 1490
},
{
"entropy": 1.1953125,
"epoch": 0.2050020500205002,
"grad_norm": 0.07252808969509232,
"learning_rate": 4.626955051430182e-06,
"loss": 1.197,
"mean_token_accuracy": 0.7176209568977356,
"num_tokens": 138673916.0,
"step": 1500
},
{
"entropy": 1.27421875,
"epoch": 0.20636873035397021,
"grad_norm": 0.07800134169387124,
"learning_rate": 4.623432436240665e-06,
"loss": 1.2646,
"mean_token_accuracy": 0.7045609295368195,
"num_tokens": 139533756.0,
"step": 1510
},
{
"entropy": 1.22265625,
"epoch": 0.2077354106874402,
"grad_norm": 0.07348005013832411,
"learning_rate": 4.619909821051149e-06,
"loss": 1.212,
"mean_token_accuracy": 0.7133168697357177,
"num_tokens": 140388113.0,
"step": 1520
},
{
"entropy": 1.22890625,
"epoch": 0.2091020910209102,
"grad_norm": 0.10307166093969025,
"learning_rate": 4.616387205861632e-06,
"loss": 1.2331,
"mean_token_accuracy": 0.708920705318451,
"num_tokens": 141300644.0,
"step": 1530
},
{
"entropy": 1.2046875,
"epoch": 0.21046877135438022,
"grad_norm": 0.07452734067694752,
"learning_rate": 4.6128645906721156e-06,
"loss": 1.2009,
"mean_token_accuracy": 0.7151067554950714,
"num_tokens": 142178254.0,
"step": 1540
},
{
"entropy": 1.27734375,
"epoch": 0.2118354516878502,
"grad_norm": 0.0767828190701009,
"learning_rate": 4.609341975482598e-06,
"loss": 1.2752,
"mean_token_accuracy": 0.7045287549495697,
"num_tokens": 143088982.0,
"step": 1550
},
{
"entropy": 1.2375,
"epoch": 0.2132021320213202,
"grad_norm": 0.07657474416653137,
"learning_rate": 4.605819360293082e-06,
"loss": 1.2332,
"mean_token_accuracy": 0.7100919723510742,
"num_tokens": 144044472.0,
"step": 1560
},
{
"entropy": 1.18515625,
"epoch": 0.21456881235479022,
"grad_norm": 0.06898712502921248,
"learning_rate": 4.602296745103565e-06,
"loss": 1.1881,
"mean_token_accuracy": 0.7193587481975555,
"num_tokens": 144956757.0,
"step": 1570
},
{
"entropy": 1.24921875,
"epoch": 0.2159354926882602,
"grad_norm": 0.07709254122850251,
"learning_rate": 4.598774129914049e-06,
"loss": 1.2675,
"mean_token_accuracy": 0.703972727060318,
"num_tokens": 145879897.0,
"step": 1580
},
{
"entropy": 1.24296875,
"epoch": 0.21730217302173022,
"grad_norm": 0.08379424321348376,
"learning_rate": 4.595251514724532e-06,
"loss": 1.2285,
"mean_token_accuracy": 0.7085557639598846,
"num_tokens": 146783735.0,
"step": 1590
},
{
"entropy": 1.22109375,
"epoch": 0.21866885335520023,
"grad_norm": 0.06565377413400253,
"learning_rate": 4.591728899535015e-06,
"loss": 1.2018,
"mean_token_accuracy": 0.7154344916343689,
"num_tokens": 147702553.0,
"step": 1600
},
{
"entropy": 1.18515625,
"epoch": 0.2200355336886702,
"grad_norm": 0.13450180924705082,
"learning_rate": 4.588206284345499e-06,
"loss": 1.1791,
"mean_token_accuracy": 0.7195065975189209,
"num_tokens": 148628418.0,
"step": 1610
},
{
"entropy": 1.29921875,
"epoch": 0.22140221402214022,
"grad_norm": 0.08775177824072206,
"learning_rate": 4.584683669155981e-06,
"loss": 1.3021,
"mean_token_accuracy": 0.6977591395378113,
"num_tokens": 149568000.0,
"step": 1620
},
{
"entropy": 1.26328125,
"epoch": 0.22276889435561023,
"grad_norm": 0.0767782591852522,
"learning_rate": 4.581161053966465e-06,
"loss": 1.2638,
"mean_token_accuracy": 0.7059417605400086,
"num_tokens": 150546517.0,
"step": 1630
},
{
"entropy": 1.28828125,
"epoch": 0.2241355746890802,
"grad_norm": 0.08234708126515454,
"learning_rate": 4.5776384387769485e-06,
"loss": 1.2992,
"mean_token_accuracy": 0.7004118978977203,
"num_tokens": 151479506.0,
"step": 1640
},
{
"entropy": 1.2171875,
"epoch": 0.22550225502255022,
"grad_norm": 0.0732060879446106,
"learning_rate": 4.574115823587432e-06,
"loss": 1.2079,
"mean_token_accuracy": 0.7160106897354126,
"num_tokens": 152405951.0,
"step": 1650
},
{
"entropy": 1.221875,
"epoch": 0.22686893535602023,
"grad_norm": 0.07378525941364163,
"learning_rate": 4.570593208397915e-06,
"loss": 1.2282,
"mean_token_accuracy": 0.7123442411422729,
"num_tokens": 153325504.0,
"step": 1660
},
{
"entropy": 1.2328125,
"epoch": 0.22823561568949022,
"grad_norm": 0.07252988225854161,
"learning_rate": 4.567070593208398e-06,
"loss": 1.2241,
"mean_token_accuracy": 0.711389821767807,
"num_tokens": 154226518.0,
"step": 1670
},
{
"entropy": 1.22109375,
"epoch": 0.22960229602296023,
"grad_norm": 0.07785297094287891,
"learning_rate": 4.563547978018882e-06,
"loss": 1.2212,
"mean_token_accuracy": 0.7110374689102172,
"num_tokens": 155166706.0,
"step": 1680
},
{
"entropy": 1.18515625,
"epoch": 0.23096897635643024,
"grad_norm": 0.06868505415853472,
"learning_rate": 4.560025362829365e-06,
"loss": 1.1804,
"mean_token_accuracy": 0.718169915676117,
"num_tokens": 156080529.0,
"step": 1690
},
{
"entropy": 1.221875,
"epoch": 0.23233565668990022,
"grad_norm": 0.07223236411010474,
"learning_rate": 4.556502747639848e-06,
"loss": 1.2291,
"mean_token_accuracy": 0.7113784909248352,
"num_tokens": 157016731.0,
"step": 1700
},
{
"entropy": 1.23125,
"epoch": 0.23370233702337023,
"grad_norm": 0.07350305802838646,
"learning_rate": 4.552980132450332e-06,
"loss": 1.2296,
"mean_token_accuracy": 0.710027402639389,
"num_tokens": 157986519.0,
"step": 1710
},
{
"entropy": 1.20625,
"epoch": 0.23506901735684024,
"grad_norm": 0.07037792447198309,
"learning_rate": 4.549457517260814e-06,
"loss": 1.2041,
"mean_token_accuracy": 0.7160946309566498,
"num_tokens": 158934641.0,
"step": 1720
},
{
"entropy": 1.2,
"epoch": 0.23643569769031023,
"grad_norm": 0.08156353720805053,
"learning_rate": 4.545934902071298e-06,
"loss": 1.2041,
"mean_token_accuracy": 0.7151033580303192,
"num_tokens": 159912974.0,
"step": 1730
},
{
"entropy": 1.2609375,
"epoch": 0.23780237802378024,
"grad_norm": 0.08874992048779999,
"learning_rate": 4.5424122868817814e-06,
"loss": 1.2628,
"mean_token_accuracy": 0.704311466217041,
"num_tokens": 160803757.0,
"step": 1740
},
{
"entropy": 1.1875,
"epoch": 0.23916905835725025,
"grad_norm": 0.0764948173222749,
"learning_rate": 4.538889671692265e-06,
"loss": 1.1821,
"mean_token_accuracy": 0.7199183523654937,
"num_tokens": 161701172.0,
"step": 1750
},
{
"entropy": 1.19765625,
"epoch": 0.24053573869072023,
"grad_norm": 0.0746568654258214,
"learning_rate": 4.535367056502748e-06,
"loss": 1.1915,
"mean_token_accuracy": 0.7167064428329468,
"num_tokens": 162650956.0,
"step": 1760
},
{
"entropy": 1.24140625,
"epoch": 0.24190241902419024,
"grad_norm": 0.07089744525859262,
"learning_rate": 4.531844441313231e-06,
"loss": 1.2381,
"mean_token_accuracy": 0.7088140547275543,
"num_tokens": 163571702.0,
"step": 1770
},
{
"entropy": 1.2,
"epoch": 0.24326909935766025,
"grad_norm": 0.0701772369496898,
"learning_rate": 4.528321826123715e-06,
"loss": 1.1986,
"mean_token_accuracy": 0.7168674647808075,
"num_tokens": 164471607.0,
"step": 1780
},
{
"entropy": 1.23125,
"epoch": 0.24463577969113023,
"grad_norm": 0.07052826014698582,
"learning_rate": 4.524799210934198e-06,
"loss": 1.229,
"mean_token_accuracy": 0.7107847332954407,
"num_tokens": 165399554.0,
"step": 1790
},
{
"entropy": 1.22890625,
"epoch": 0.24600246002460024,
"grad_norm": 0.07058864319365593,
"learning_rate": 4.521276595744681e-06,
"loss": 1.2165,
"mean_token_accuracy": 0.7140754640102387,
"num_tokens": 166291849.0,
"step": 1800
},
{
"entropy": 1.25390625,
"epoch": 0.24736914035807026,
"grad_norm": 0.06919326171107694,
"learning_rate": 4.5177539805551646e-06,
"loss": 1.266,
"mean_token_accuracy": 0.7049540102481842,
"num_tokens": 167206313.0,
"step": 1810
},
{
"entropy": 1.2125,
"epoch": 0.24873582069154024,
"grad_norm": 0.1084347744135834,
"learning_rate": 4.514231365365648e-06,
"loss": 1.2188,
"mean_token_accuracy": 0.7117671549320221,
"num_tokens": 168088688.0,
"step": 1820
},
{
"entropy": 1.2109375,
"epoch": 0.25010250102501025,
"grad_norm": 0.07146671650649467,
"learning_rate": 4.510708750176131e-06,
"loss": 1.191,
"mean_token_accuracy": 0.718199622631073,
"num_tokens": 168991287.0,
"step": 1830
},
{
"entropy": 1.19765625,
"epoch": 0.25146918135848023,
"grad_norm": 0.11636056881371164,
"learning_rate": 4.507186134986614e-06,
"loss": 1.192,
"mean_token_accuracy": 0.7183267593383789,
"num_tokens": 169961051.0,
"step": 1840
},
{
"entropy": 1.203125,
"epoch": 0.25283586169195027,
"grad_norm": 0.0676788489628212,
"learning_rate": 4.503663519797098e-06,
"loss": 1.2048,
"mean_token_accuracy": 0.713425999879837,
"num_tokens": 170889299.0,
"step": 1850
},
{
"entropy": 1.209375,
"epoch": 0.25420254202542025,
"grad_norm": 0.07021962374603712,
"learning_rate": 4.5001409046075814e-06,
"loss": 1.2128,
"mean_token_accuracy": 0.7152886390686035,
"num_tokens": 171801507.0,
"step": 1860
},
{
"entropy": 1.1828125,
"epoch": 0.25556922235889024,
"grad_norm": 0.08548461179935625,
"learning_rate": 4.496618289418064e-06,
"loss": 1.1732,
"mean_token_accuracy": 0.721614420413971,
"num_tokens": 172756808.0,
"step": 1870
},
{
"entropy": 1.215625,
"epoch": 0.2569359026923603,
"grad_norm": 0.07949500879506492,
"learning_rate": 4.493095674228548e-06,
"loss": 1.2077,
"mean_token_accuracy": 0.7160881042480469,
"num_tokens": 173650984.0,
"step": 1880
},
{
"entropy": 1.228125,
"epoch": 0.25830258302583026,
"grad_norm": 0.12516038074498204,
"learning_rate": 4.48957305903903e-06,
"loss": 1.2272,
"mean_token_accuracy": 0.7109438419342041,
"num_tokens": 174590986.0,
"step": 1890
},
{
"entropy": 1.234375,
"epoch": 0.25966926335930024,
"grad_norm": 0.0753483432373505,
"learning_rate": 4.486050443849515e-06,
"loss": 1.2325,
"mean_token_accuracy": 0.711800217628479,
"num_tokens": 175502905.0,
"step": 1900
},
{
"entropy": 1.20703125,
"epoch": 0.2610359436927703,
"grad_norm": 0.08047796344501396,
"learning_rate": 4.4825278286599975e-06,
"loss": 1.2128,
"mean_token_accuracy": 0.7141524672508239,
"num_tokens": 176434918.0,
"step": 1910
},
{
"entropy": 1.16953125,
"epoch": 0.26240262402624026,
"grad_norm": 0.07606733472258645,
"learning_rate": 4.479005213470481e-06,
"loss": 1.172,
"mean_token_accuracy": 0.7225373744964599,
"num_tokens": 177351387.0,
"step": 1920
},
{
"entropy": 1.2234375,
"epoch": 0.26376930435971024,
"grad_norm": 0.07848285419556647,
"learning_rate": 4.475482598280964e-06,
"loss": 1.2153,
"mean_token_accuracy": 0.7116812825202942,
"num_tokens": 178273072.0,
"step": 1930
},
{
"entropy": 1.1953125,
"epoch": 0.2651359846931803,
"grad_norm": 0.06775195730623412,
"learning_rate": 4.471959983091447e-06,
"loss": 1.1942,
"mean_token_accuracy": 0.7178191423416138,
"num_tokens": 179201344.0,
"step": 1940
},
{
"entropy": 1.2234375,
"epoch": 0.26650266502665027,
"grad_norm": 0.07475608978192656,
"learning_rate": 4.468437367901931e-06,
"loss": 1.235,
"mean_token_accuracy": 0.7118826448917389,
"num_tokens": 180128488.0,
"step": 1950
},
{
"entropy": 1.24765625,
"epoch": 0.26786934536012025,
"grad_norm": 0.14051062364917996,
"learning_rate": 4.464914752712414e-06,
"loss": 1.2417,
"mean_token_accuracy": 0.7098841309547425,
"num_tokens": 181044049.0,
"step": 1960
},
{
"entropy": 1.17109375,
"epoch": 0.2692360256935903,
"grad_norm": 0.0695131432912705,
"learning_rate": 4.461392137522897e-06,
"loss": 1.1714,
"mean_token_accuracy": 0.7219539880752563,
"num_tokens": 181988674.0,
"step": 1970
},
{
"entropy": 1.16875,
"epoch": 0.27060270602706027,
"grad_norm": 0.07179003635194427,
"learning_rate": 4.457869522333381e-06,
"loss": 1.1566,
"mean_token_accuracy": 0.7240561246871948,
"num_tokens": 182885151.0,
"step": 1980
},
{
"entropy": 1.2046875,
"epoch": 0.27196938636053025,
"grad_norm": 0.15656239814433573,
"learning_rate": 4.454346907143864e-06,
"loss": 1.2061,
"mean_token_accuracy": 0.7177169501781464,
"num_tokens": 183799094.0,
"step": 1990
},
{
"entropy": 1.234375,
"epoch": 0.2733360666940003,
"grad_norm": 0.07754115914888177,
"learning_rate": 4.450824291954347e-06,
"loss": 1.2345,
"mean_token_accuracy": 0.7113010823726654,
"num_tokens": 184737383.0,
"step": 2000
},
{
"entropy": 1.1765625,
"epoch": 0.2747027470274703,
"grad_norm": 0.06281388978597299,
"learning_rate": 4.44730167676483e-06,
"loss": 1.183,
"mean_token_accuracy": 0.7203613877296448,
"num_tokens": 185666325.0,
"step": 2010
},
{
"entropy": 1.2125,
"epoch": 0.27606942736094026,
"grad_norm": 0.08169312235422904,
"learning_rate": 4.443779061575314e-06,
"loss": 1.2121,
"mean_token_accuracy": 0.7137266814708709,
"num_tokens": 186596675.0,
"step": 2020
},
{
"entropy": 1.180859375,
"epoch": 0.2774361076944103,
"grad_norm": 0.09770655565469033,
"learning_rate": 4.4402564463857975e-06,
"loss": 1.1754,
"mean_token_accuracy": 0.7193385362625122,
"num_tokens": 187482907.0,
"step": 2030
},
{
"entropy": 1.2078125,
"epoch": 0.2788027880278803,
"grad_norm": 0.07320705983308669,
"learning_rate": 4.43673383119628e-06,
"loss": 1.2218,
"mean_token_accuracy": 0.7123588800430298,
"num_tokens": 188385774.0,
"step": 2040
},
{
"entropy": 1.190625,
"epoch": 0.28016946836135026,
"grad_norm": 0.07461998453023624,
"learning_rate": 4.433211216006764e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.7170797526836395,
"num_tokens": 189277151.0,
"step": 2050
},
{
"entropy": 1.196875,
"epoch": 0.2815361486948203,
"grad_norm": 0.07351334088577238,
"learning_rate": 4.4296886008172465e-06,
"loss": 1.2084,
"mean_token_accuracy": 0.7136577785015106,
"num_tokens": 190184956.0,
"step": 2060
},
{
"entropy": 1.2015625,
"epoch": 0.2829028290282903,
"grad_norm": 0.07543136254433923,
"learning_rate": 4.426165985627731e-06,
"loss": 1.207,
"mean_token_accuracy": 0.7164862215518951,
"num_tokens": 191146194.0,
"step": 2070
},
{
"entropy": 1.16484375,
"epoch": 0.28426950936176026,
"grad_norm": 0.06940352248230017,
"learning_rate": 4.4226433704382136e-06,
"loss": 1.1624,
"mean_token_accuracy": 0.724340295791626,
"num_tokens": 192033840.0,
"step": 2080
},
{
"entropy": 1.193359375,
"epoch": 0.2856361896952303,
"grad_norm": 0.0741735074601755,
"learning_rate": 4.419120755248697e-06,
"loss": 1.1898,
"mean_token_accuracy": 0.7174428999423981,
"num_tokens": 192955176.0,
"step": 2090
},
{
"entropy": 1.284375,
"epoch": 0.2870028700287003,
"grad_norm": 0.07185358282527839,
"learning_rate": 4.41559814005918e-06,
"loss": 1.2947,
"mean_token_accuracy": 0.700280612707138,
"num_tokens": 193892359.0,
"step": 2100
},
{
"entropy": 1.2,
"epoch": 0.28836955036217027,
"grad_norm": 0.07467749156128081,
"learning_rate": 4.412075524869663e-06,
"loss": 1.1996,
"mean_token_accuracy": 0.7174516320228577,
"num_tokens": 194764643.0,
"step": 2110
},
{
"entropy": 1.22578125,
"epoch": 0.2897362306956403,
"grad_norm": 0.07273486855345908,
"learning_rate": 4.408552909680147e-06,
"loss": 1.2216,
"mean_token_accuracy": 0.7141775369644165,
"num_tokens": 195672996.0,
"step": 2120
},
{
"entropy": 1.209375,
"epoch": 0.2911029110291103,
"grad_norm": 0.06810128665906601,
"learning_rate": 4.4050302944906304e-06,
"loss": 1.212,
"mean_token_accuracy": 0.7142085075378418,
"num_tokens": 196608934.0,
"step": 2130
},
{
"entropy": 1.1921875,
"epoch": 0.2924695913625803,
"grad_norm": 0.07541911790420608,
"learning_rate": 4.401507679301113e-06,
"loss": 1.1822,
"mean_token_accuracy": 0.7178063273429871,
"num_tokens": 197498404.0,
"step": 2140
},
{
"entropy": 1.290625,
"epoch": 0.2938362716960503,
"grad_norm": 0.07681434937068568,
"learning_rate": 4.397985064111597e-06,
"loss": 1.2973,
"mean_token_accuracy": 0.6993107557296753,
"num_tokens": 198454218.0,
"step": 2150
},
{
"entropy": 1.2171875,
"epoch": 0.2952029520295203,
"grad_norm": 0.06909488907865802,
"learning_rate": 4.39446244892208e-06,
"loss": 1.2113,
"mean_token_accuracy": 0.7143688261508941,
"num_tokens": 199374348.0,
"step": 2160
},
{
"entropy": 1.1984375,
"epoch": 0.2965696323629903,
"grad_norm": 0.10848628879643217,
"learning_rate": 4.390939833732563e-06,
"loss": 1.2022,
"mean_token_accuracy": 0.71532723903656,
"num_tokens": 200221659.0,
"step": 2170
},
{
"entropy": 1.2125,
"epoch": 0.2979363126964603,
"grad_norm": 0.0749881878003306,
"learning_rate": 4.387417218543047e-06,
"loss": 1.2173,
"mean_token_accuracy": 0.7133815109729766,
"num_tokens": 201155187.0,
"step": 2180
},
{
"entropy": 1.18984375,
"epoch": 0.2993029930299303,
"grad_norm": 0.07324402039139576,
"learning_rate": 4.38389460335353e-06,
"loss": 1.1764,
"mean_token_accuracy": 0.7213638544082641,
"num_tokens": 202067534.0,
"step": 2190
},
{
"entropy": 1.21484375,
"epoch": 0.3006696733634003,
"grad_norm": 0.0654010542204238,
"learning_rate": 4.380371988164014e-06,
"loss": 1.2114,
"mean_token_accuracy": 0.713360172510147,
"num_tokens": 203044126.0,
"step": 2200
},
{
"entropy": 1.183203125,
"epoch": 0.3020363536968703,
"grad_norm": 0.13704559181001616,
"learning_rate": 4.376849372974496e-06,
"loss": 1.1883,
"mean_token_accuracy": 0.7205011188983917,
"num_tokens": 203996905.0,
"step": 2210
},
{
"entropy": 1.18671875,
"epoch": 0.3034030340303403,
"grad_norm": 0.07244182084404707,
"learning_rate": 4.37332675778498e-06,
"loss": 1.1842,
"mean_token_accuracy": 0.7179835259914398,
"num_tokens": 204929875.0,
"step": 2220
},
{
"entropy": 1.19765625,
"epoch": 0.3047697143638103,
"grad_norm": 0.07262695875052605,
"learning_rate": 4.369804142595463e-06,
"loss": 1.2113,
"mean_token_accuracy": 0.7149807453155518,
"num_tokens": 205839908.0,
"step": 2230
},
{
"entropy": 1.21171875,
"epoch": 0.3061363946972803,
"grad_norm": 0.07363928664485664,
"learning_rate": 4.366281527405947e-06,
"loss": 1.2229,
"mean_token_accuracy": 0.7123925805091857,
"num_tokens": 206800232.0,
"step": 2240
},
{
"entropy": 1.1609375,
"epoch": 0.3075030750307503,
"grad_norm": 0.07411457742187767,
"learning_rate": 4.36275891221643e-06,
"loss": 1.1586,
"mean_token_accuracy": 0.7220691919326783,
"num_tokens": 207736757.0,
"step": 2250
},
{
"entropy": 1.1625,
"epoch": 0.3088697553642203,
"grad_norm": 0.07693225832231053,
"learning_rate": 4.359236297026913e-06,
"loss": 1.1618,
"mean_token_accuracy": 0.7228505134582519,
"num_tokens": 208650386.0,
"step": 2260
},
{
"entropy": 1.15625,
"epoch": 0.3102364356976903,
"grad_norm": 0.08026886888120692,
"learning_rate": 4.355713681837396e-06,
"loss": 1.1641,
"mean_token_accuracy": 0.722562599182129,
"num_tokens": 209561366.0,
"step": 2270
},
{
"entropy": 1.1828125,
"epoch": 0.3116031160311603,
"grad_norm": 0.07377344262679715,
"learning_rate": 4.352191066647879e-06,
"loss": 1.1882,
"mean_token_accuracy": 0.7189028680324554,
"num_tokens": 210471900.0,
"step": 2280
},
{
"entropy": 1.2015625,
"epoch": 0.3129697963646303,
"grad_norm": 0.07903365340810742,
"learning_rate": 4.348668451458363e-06,
"loss": 1.2073,
"mean_token_accuracy": 0.7155981659889221,
"num_tokens": 211400549.0,
"step": 2290
},
{
"entropy": 1.21875,
"epoch": 0.31433647669810033,
"grad_norm": 0.0760947092729344,
"learning_rate": 4.3451458362688465e-06,
"loss": 1.2185,
"mean_token_accuracy": 0.7136685132980347,
"num_tokens": 212322958.0,
"step": 2300
},
{
"entropy": 1.23203125,
"epoch": 0.3157031570315703,
"grad_norm": 0.07364035315878105,
"learning_rate": 4.34162322107933e-06,
"loss": 1.2184,
"mean_token_accuracy": 0.7146215200424194,
"num_tokens": 213218481.0,
"step": 2310
},
{
"entropy": 1.2171875,
"epoch": 0.3170698373650403,
"grad_norm": 0.06902409200779218,
"learning_rate": 4.338100605889813e-06,
"loss": 1.2145,
"mean_token_accuracy": 0.7147466719150544,
"num_tokens": 214104458.0,
"step": 2320
},
{
"entropy": 1.165234375,
"epoch": 0.31843651769851034,
"grad_norm": 0.07716745433762566,
"learning_rate": 4.334577990700296e-06,
"loss": 1.1515,
"mean_token_accuracy": 0.7265345990657807,
"num_tokens": 214997474.0,
"step": 2330
},
{
"entropy": 1.24296875,
"epoch": 0.3198031980319803,
"grad_norm": 0.07667773522209877,
"learning_rate": 4.33105537551078e-06,
"loss": 1.2476,
"mean_token_accuracy": 0.7071598768234253,
"num_tokens": 215898945.0,
"step": 2340
},
{
"entropy": 1.16484375,
"epoch": 0.3211698783654503,
"grad_norm": 0.08910745011690177,
"learning_rate": 4.327532760321263e-06,
"loss": 1.1551,
"mean_token_accuracy": 0.7243890643119812,
"num_tokens": 216780044.0,
"step": 2350
},
{
"entropy": 1.20859375,
"epoch": 0.32253655869892034,
"grad_norm": 0.07923488965593228,
"learning_rate": 4.324010145131746e-06,
"loss": 1.2073,
"mean_token_accuracy": 0.7135173380374908,
"num_tokens": 217686514.0,
"step": 2360
},
{
"entropy": 1.20546875,
"epoch": 0.3239032390323903,
"grad_norm": 0.06872249594370022,
"learning_rate": 4.32048752994223e-06,
"loss": 1.2135,
"mean_token_accuracy": 0.7140368580818176,
"num_tokens": 218646157.0,
"step": 2370
},
{
"entropy": 1.221875,
"epoch": 0.3252699193658603,
"grad_norm": 0.09594772585075656,
"learning_rate": 4.316964914752712e-06,
"loss": 1.2202,
"mean_token_accuracy": 0.7130348861217499,
"num_tokens": 219565771.0,
"step": 2380
},
{
"entropy": 1.21328125,
"epoch": 0.32663659969933034,
"grad_norm": 0.07214105230390774,
"learning_rate": 4.313442299563196e-06,
"loss": 1.2141,
"mean_token_accuracy": 0.7148744463920593,
"num_tokens": 220446709.0,
"step": 2390
},
{
"entropy": 1.1890625,
"epoch": 0.3280032800328003,
"grad_norm": 0.07979901155792489,
"learning_rate": 4.3099196843736794e-06,
"loss": 1.1911,
"mean_token_accuracy": 0.7168530225753784,
"num_tokens": 221335909.0,
"step": 2400
},
{
"entropy": 1.203125,
"epoch": 0.3293699603662703,
"grad_norm": 0.0760466259524259,
"learning_rate": 4.306397069184163e-06,
"loss": 1.198,
"mean_token_accuracy": 0.7148430943489075,
"num_tokens": 222251385.0,
"step": 2410
},
{
"entropy": 1.2046875,
"epoch": 0.33073664069974035,
"grad_norm": 0.08737256568902925,
"learning_rate": 4.302874453994646e-06,
"loss": 1.209,
"mean_token_accuracy": 0.7145255386829377,
"num_tokens": 223160354.0,
"step": 2420
},
{
"entropy": 1.19140625,
"epoch": 0.33210332103321033,
"grad_norm": 0.06931616182183084,
"learning_rate": 4.299351838805129e-06,
"loss": 1.1894,
"mean_token_accuracy": 0.7179914057254791,
"num_tokens": 224079742.0,
"step": 2430
},
{
"entropy": 1.21328125,
"epoch": 0.3334700013666803,
"grad_norm": 0.06940578045537497,
"learning_rate": 4.295829223615613e-06,
"loss": 1.2127,
"mean_token_accuracy": 0.7143593609333039,
"num_tokens": 225006893.0,
"step": 2440
},
{
"entropy": 1.215625,
"epoch": 0.33483668170015035,
"grad_norm": 0.07277331001623083,
"learning_rate": 4.292306608426096e-06,
"loss": 1.2089,
"mean_token_accuracy": 0.7144564747810364,
"num_tokens": 225912700.0,
"step": 2450
},
{
"entropy": 1.171875,
"epoch": 0.33620336203362033,
"grad_norm": 0.06865649640590905,
"learning_rate": 4.288783993236579e-06,
"loss": 1.1608,
"mean_token_accuracy": 0.7236881017684936,
"num_tokens": 226846501.0,
"step": 2460
},
{
"entropy": 1.2265625,
"epoch": 0.3375700423670903,
"grad_norm": 0.07986764192347978,
"learning_rate": 4.285261378047063e-06,
"loss": 1.2251,
"mean_token_accuracy": 0.7131475150585175,
"num_tokens": 227775064.0,
"step": 2470
},
{
"entropy": 1.19375,
"epoch": 0.33893672270056036,
"grad_norm": 0.06890779330276539,
"learning_rate": 4.281738762857546e-06,
"loss": 1.196,
"mean_token_accuracy": 0.7150323927402497,
"num_tokens": 228705788.0,
"step": 2480
},
{
"entropy": 1.25390625,
"epoch": 0.34030340303403034,
"grad_norm": 0.07604999138283867,
"learning_rate": 4.278216147668029e-06,
"loss": 1.2489,
"mean_token_accuracy": 0.7065034687519074,
"num_tokens": 229637091.0,
"step": 2490
},
{
"entropy": 1.21328125,
"epoch": 0.3416700833675003,
"grad_norm": 0.07592793101666603,
"learning_rate": 4.274693532478512e-06,
"loss": 1.2114,
"mean_token_accuracy": 0.7130004227161407,
"num_tokens": 230557197.0,
"step": 2500
},
{
"entropy": 1.15703125,
"epoch": 0.34303676370097036,
"grad_norm": 0.06651257910017482,
"learning_rate": 4.271170917288996e-06,
"loss": 1.1575,
"mean_token_accuracy": 0.7247360050678253,
"num_tokens": 231469955.0,
"step": 2510
},
{
"entropy": 1.19453125,
"epoch": 0.34440344403444034,
"grad_norm": 0.07589049129349038,
"learning_rate": 4.2676483020994795e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.7188793301582337,
"num_tokens": 232397901.0,
"step": 2520
},
{
"entropy": 1.20703125,
"epoch": 0.3457701243679103,
"grad_norm": 0.06834806099928026,
"learning_rate": 4.264125686909962e-06,
"loss": 1.217,
"mean_token_accuracy": 0.7136930584907532,
"num_tokens": 233361773.0,
"step": 2530
},
{
"entropy": 1.18984375,
"epoch": 0.34713680470138036,
"grad_norm": 0.07422668338832532,
"learning_rate": 4.260603071720446e-06,
"loss": 1.1949,
"mean_token_accuracy": 0.7171481668949127,
"num_tokens": 234268527.0,
"step": 2540
},
{
"entropy": 1.24296875,
"epoch": 0.34850348503485035,
"grad_norm": 0.07645122150446597,
"learning_rate": 4.257080456530928e-06,
"loss": 1.2456,
"mean_token_accuracy": 0.7077441513538361,
"num_tokens": 235186978.0,
"step": 2550
},
{
"entropy": 1.17109375,
"epoch": 0.34987016536832033,
"grad_norm": 0.07172406495663954,
"learning_rate": 4.253557841341413e-06,
"loss": 1.1698,
"mean_token_accuracy": 0.7216856241226196,
"num_tokens": 236119762.0,
"step": 2560
},
{
"entropy": 1.1828125,
"epoch": 0.35123684570179037,
"grad_norm": 0.09209852283068155,
"learning_rate": 4.2500352261518955e-06,
"loss": 1.187,
"mean_token_accuracy": 0.7167738199234008,
"num_tokens": 237045069.0,
"step": 2570
},
{
"entropy": 1.13984375,
"epoch": 0.35260352603526035,
"grad_norm": 0.07694180404852952,
"learning_rate": 4.246512610962379e-06,
"loss": 1.1389,
"mean_token_accuracy": 0.72796910405159,
"num_tokens": 237966003.0,
"step": 2580
},
{
"entropy": 1.23125,
"epoch": 0.35397020636873033,
"grad_norm": 0.0737421925713148,
"learning_rate": 4.242989995772862e-06,
"loss": 1.2356,
"mean_token_accuracy": 0.7109938859939575,
"num_tokens": 238847792.0,
"step": 2590
},
{
"entropy": 1.20078125,
"epoch": 0.35533688670220037,
"grad_norm": 0.06800427475265554,
"learning_rate": 4.239467380583345e-06,
"loss": 1.204,
"mean_token_accuracy": 0.7154838144779205,
"num_tokens": 239817460.0,
"step": 2600
},
{
"entropy": 1.18203125,
"epoch": 0.35670356703567035,
"grad_norm": 0.06813908530574964,
"learning_rate": 4.235944765393829e-06,
"loss": 1.1736,
"mean_token_accuracy": 0.7216013431549072,
"num_tokens": 240730875.0,
"step": 2610
},
{
"entropy": 1.19375,
"epoch": 0.35807024736914034,
"grad_norm": 0.07576735260458077,
"learning_rate": 4.232422150204312e-06,
"loss": 1.2003,
"mean_token_accuracy": 0.7149185478687287,
"num_tokens": 241618927.0,
"step": 2620
},
{
"entropy": 1.2625,
"epoch": 0.3594369277026104,
"grad_norm": 0.08879555347351553,
"learning_rate": 4.228899535014795e-06,
"loss": 1.2643,
"mean_token_accuracy": 0.7041713893413544,
"num_tokens": 242506323.0,
"step": 2630
},
{
"entropy": 1.19921875,
"epoch": 0.36080360803608036,
"grad_norm": 0.1155615913176657,
"learning_rate": 4.225376919825279e-06,
"loss": 1.2057,
"mean_token_accuracy": 0.7157654762268066,
"num_tokens": 243419790.0,
"step": 2640
},
{
"entropy": 1.183984375,
"epoch": 0.36217028836955034,
"grad_norm": 0.06576301335813085,
"learning_rate": 4.221854304635762e-06,
"loss": 1.175,
"mean_token_accuracy": 0.7191399037837982,
"num_tokens": 244337605.0,
"step": 2650
},
{
"entropy": 1.24375,
"epoch": 0.3635369687030204,
"grad_norm": 0.07085377524441232,
"learning_rate": 4.218331689446245e-06,
"loss": 1.2551,
"mean_token_accuracy": 0.7064870715141296,
"num_tokens": 245269588.0,
"step": 2660
},
{
"entropy": 1.17890625,
"epoch": 0.36490364903649036,
"grad_norm": 0.0746135167292139,
"learning_rate": 4.2148090742567284e-06,
"loss": 1.1733,
"mean_token_accuracy": 0.7232432067394257,
"num_tokens": 246183522.0,
"step": 2670
},
{
"entropy": 1.1828125,
"epoch": 0.36627032936996035,
"grad_norm": 0.0683970915987098,
"learning_rate": 4.211286459067212e-06,
"loss": 1.1957,
"mean_token_accuracy": 0.7146703779697419,
"num_tokens": 247159185.0,
"step": 2680
},
{
"entropy": 1.15703125,
"epoch": 0.3676370097034304,
"grad_norm": 0.07036466627788474,
"learning_rate": 4.2077638438776955e-06,
"loss": 1.1451,
"mean_token_accuracy": 0.7274271607398987,
"num_tokens": 248044696.0,
"step": 2690
},
{
"entropy": 1.17421875,
"epoch": 0.36900369003690037,
"grad_norm": 0.07006146608219765,
"learning_rate": 4.204241228688178e-06,
"loss": 1.1811,
"mean_token_accuracy": 0.720886504650116,
"num_tokens": 249045044.0,
"step": 2700
},
{
"entropy": 1.1859375,
"epoch": 0.37037037037037035,
"grad_norm": 0.07895986978965068,
"learning_rate": 4.200718613498662e-06,
"loss": 1.1878,
"mean_token_accuracy": 0.7188835442066193,
"num_tokens": 249974276.0,
"step": 2710
},
{
"entropy": 1.18203125,
"epoch": 0.3717370507038404,
"grad_norm": 0.07477961733916341,
"learning_rate": 4.1971959983091445e-06,
"loss": 1.1723,
"mean_token_accuracy": 0.7216056048870086,
"num_tokens": 250873544.0,
"step": 2720
},
{
"entropy": 1.15859375,
"epoch": 0.37310373103731037,
"grad_norm": 0.0699736405938027,
"learning_rate": 4.193673383119629e-06,
"loss": 1.1515,
"mean_token_accuracy": 0.724641990661621,
"num_tokens": 251819879.0,
"step": 2730
},
{
"entropy": 1.2421875,
"epoch": 0.37447041137078035,
"grad_norm": 0.07026617306593014,
"learning_rate": 4.190150767930112e-06,
"loss": 1.2438,
"mean_token_accuracy": 0.7089731693267822,
"num_tokens": 252807631.0,
"step": 2740
},
{
"entropy": 1.2375,
"epoch": 0.3758370917042504,
"grad_norm": 0.0726617532622753,
"learning_rate": 4.186628152740595e-06,
"loss": 1.2432,
"mean_token_accuracy": 0.7086890578269959,
"num_tokens": 253747908.0,
"step": 2750
},
{
"entropy": 1.1875,
"epoch": 0.3772037720377204,
"grad_norm": 0.07854195605778609,
"learning_rate": 4.183105537551078e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.719807106256485,
"num_tokens": 254680625.0,
"step": 2760
},
{
"entropy": 1.21953125,
"epoch": 0.37857045237119036,
"grad_norm": 0.08090602398818847,
"learning_rate": 4.179582922361561e-06,
"loss": 1.228,
"mean_token_accuracy": 0.7107358276844025,
"num_tokens": 255573141.0,
"step": 2770
},
{
"entropy": 1.221875,
"epoch": 0.3799371327046604,
"grad_norm": 0.077292079240722,
"learning_rate": 4.176060307172045e-06,
"loss": 1.2149,
"mean_token_accuracy": 0.7120519340038299,
"num_tokens": 256517622.0,
"step": 2780
},
{
"entropy": 1.19375,
"epoch": 0.3813038130381304,
"grad_norm": 0.06840168456916693,
"learning_rate": 4.1725376919825285e-06,
"loss": 1.1846,
"mean_token_accuracy": 0.7197319447994233,
"num_tokens": 257406301.0,
"step": 2790
},
{
"entropy": 1.2015625,
"epoch": 0.38267049337160036,
"grad_norm": 0.07160532100385883,
"learning_rate": 4.169015076793011e-06,
"loss": 1.2048,
"mean_token_accuracy": 0.7156904339790344,
"num_tokens": 258296600.0,
"step": 2800
},
{
"entropy": 1.165625,
"epoch": 0.3840371737050704,
"grad_norm": 0.07274643594828015,
"learning_rate": 4.165492461603495e-06,
"loss": 1.166,
"mean_token_accuracy": 0.7226861894130707,
"num_tokens": 259223724.0,
"step": 2810
},
{
"entropy": 1.18125,
"epoch": 0.3854038540385404,
"grad_norm": 0.12191727322360216,
"learning_rate": 4.161969846413978e-06,
"loss": 1.1844,
"mean_token_accuracy": 0.7197048008441925,
"num_tokens": 260114709.0,
"step": 2820
},
{
"entropy": 1.130078125,
"epoch": 0.38677053437201037,
"grad_norm": 0.06794087212920467,
"learning_rate": 4.158447231224461e-06,
"loss": 1.1339,
"mean_token_accuracy": 0.7287942409515381,
"num_tokens": 261095644.0,
"step": 2830
},
{
"entropy": 1.19609375,
"epoch": 0.3881372147054804,
"grad_norm": 0.06945456231751064,
"learning_rate": 4.1549246160349445e-06,
"loss": 1.1906,
"mean_token_accuracy": 0.7177744567394256,
"num_tokens": 262062947.0,
"step": 2840
},
{
"entropy": 1.18125,
"epoch": 0.3895038950389504,
"grad_norm": 0.0701839662981836,
"learning_rate": 4.151402000845428e-06,
"loss": 1.1819,
"mean_token_accuracy": 0.7184736132621765,
"num_tokens": 263008190.0,
"step": 2850
},
{
"entropy": 1.1734375,
"epoch": 0.39087057537242037,
"grad_norm": 0.07509269468228826,
"learning_rate": 4.147879385655912e-06,
"loss": 1.1645,
"mean_token_accuracy": 0.7234985768795014,
"num_tokens": 263918362.0,
"step": 2860
},
{
"entropy": 1.1890625,
"epoch": 0.3922372557058904,
"grad_norm": 0.08216394020360349,
"learning_rate": 4.144356770466394e-06,
"loss": 1.1823,
"mean_token_accuracy": 0.7182194054126739,
"num_tokens": 264804628.0,
"step": 2870
},
{
"entropy": 1.2640625,
"epoch": 0.3936039360393604,
"grad_norm": 0.07366025397581945,
"learning_rate": 4.140834155276878e-06,
"loss": 1.2632,
"mean_token_accuracy": 0.7045623362064362,
"num_tokens": 265741733.0,
"step": 2880
},
{
"entropy": 1.2140625,
"epoch": 0.3949706163728304,
"grad_norm": 0.07099111861922223,
"learning_rate": 4.137311540087361e-06,
"loss": 1.2173,
"mean_token_accuracy": 0.7146319806575775,
"num_tokens": 266715918.0,
"step": 2890
},
{
"entropy": 1.2328125,
"epoch": 0.3963372967063004,
"grad_norm": 0.07586148237892588,
"learning_rate": 4.133788924897845e-06,
"loss": 1.2248,
"mean_token_accuracy": 0.7118486762046814,
"num_tokens": 267644209.0,
"step": 2900
},
{
"entropy": 1.14921875,
"epoch": 0.3977039770397704,
"grad_norm": 0.07171588081691448,
"learning_rate": 4.130266309708328e-06,
"loss": 1.1472,
"mean_token_accuracy": 0.7277048468589783,
"num_tokens": 268556439.0,
"step": 2910
},
{
"entropy": 1.225,
"epoch": 0.3990706573732404,
"grad_norm": 0.07412769551340335,
"learning_rate": 4.126743694518811e-06,
"loss": 1.2182,
"mean_token_accuracy": 0.7131030261516571,
"num_tokens": 269504566.0,
"step": 2920
},
{
"entropy": 1.19140625,
"epoch": 0.4004373377067104,
"grad_norm": 0.07888023052302628,
"learning_rate": 4.123221079329294e-06,
"loss": 1.1865,
"mean_token_accuracy": 0.7184780895709991,
"num_tokens": 270367741.0,
"step": 2930
},
{
"entropy": 1.203125,
"epoch": 0.4018040180401804,
"grad_norm": 0.07399846515621171,
"learning_rate": 4.1196984641397774e-06,
"loss": 1.2054,
"mean_token_accuracy": 0.7147743821144104,
"num_tokens": 271271744.0,
"step": 2940
},
{
"entropy": 1.17109375,
"epoch": 0.4031706983736504,
"grad_norm": 0.07747130255525912,
"learning_rate": 4.116175848950261e-06,
"loss": 1.1727,
"mean_token_accuracy": 0.7235036253929138,
"num_tokens": 272171745.0,
"step": 2950
},
{
"entropy": 1.18125,
"epoch": 0.4045373787071204,
"grad_norm": 0.08087148187708738,
"learning_rate": 4.1126532337607445e-06,
"loss": 1.1837,
"mean_token_accuracy": 0.7189372599124908,
"num_tokens": 273016295.0,
"step": 2960
},
{
"entropy": 1.209375,
"epoch": 0.4059040590405904,
"grad_norm": 0.08555846812456325,
"learning_rate": 4.109130618571227e-06,
"loss": 1.2047,
"mean_token_accuracy": 0.7166744649410248,
"num_tokens": 273956028.0,
"step": 2970
},
{
"entropy": 1.1984375,
"epoch": 0.4072707393740604,
"grad_norm": 0.07965428277975202,
"learning_rate": 4.105608003381711e-06,
"loss": 1.1877,
"mean_token_accuracy": 0.7162480533123017,
"num_tokens": 274864033.0,
"step": 2980
},
{
"entropy": 1.14765625,
"epoch": 0.4086374197075304,
"grad_norm": 0.16727013035410992,
"learning_rate": 4.102085388192194e-06,
"loss": 1.1396,
"mean_token_accuracy": 0.7275459170341492,
"num_tokens": 275771714.0,
"step": 2990
},
{
"entropy": 1.1875,
"epoch": 0.4100041000410004,
"grad_norm": 0.08386178543080186,
"learning_rate": 4.098562773002678e-06,
"loss": 1.1902,
"mean_token_accuracy": 0.7153613328933716,
"num_tokens": 276671468.0,
"step": 3000
},
{
"entropy": 1.16796875,
"epoch": 0.4113707803744704,
"grad_norm": 0.0688054392361731,
"learning_rate": 4.095040157813161e-06,
"loss": 1.1616,
"mean_token_accuracy": 0.723707240819931,
"num_tokens": 277629191.0,
"step": 3010
},
{
"entropy": 1.20859375,
"epoch": 0.41273746070794043,
"grad_norm": 0.07455381428329927,
"learning_rate": 4.091517542623644e-06,
"loss": 1.2165,
"mean_token_accuracy": 0.7128286778926849,
"num_tokens": 278530380.0,
"step": 3020
},
{
"entropy": 1.24296875,
"epoch": 0.4141041410414104,
"grad_norm": 0.0848033896587222,
"learning_rate": 4.087994927434128e-06,
"loss": 1.2562,
"mean_token_accuracy": 0.7058168113231659,
"num_tokens": 279444562.0,
"step": 3030
},
{
"entropy": 1.20234375,
"epoch": 0.4154708213748804,
"grad_norm": 0.07399371898052384,
"learning_rate": 4.08447231224461e-06,
"loss": 1.2041,
"mean_token_accuracy": 0.714765053987503,
"num_tokens": 280334745.0,
"step": 3040
},
{
"entropy": 1.1296875,
"epoch": 0.41683750170835043,
"grad_norm": 0.07576261768453339,
"learning_rate": 4.080949697055094e-06,
"loss": 1.1375,
"mean_token_accuracy": 0.7268602609634399,
"num_tokens": 281213602.0,
"step": 3050
},
{
"entropy": 1.1609375,
"epoch": 0.4182041820418204,
"grad_norm": 0.07086614974837896,
"learning_rate": 4.0774270818655775e-06,
"loss": 1.1537,
"mean_token_accuracy": 0.726021945476532,
"num_tokens": 282155704.0,
"step": 3060
},
{
"entropy": 1.22578125,
"epoch": 0.4195708623752904,
"grad_norm": 0.07964057784787638,
"learning_rate": 4.073904466676061e-06,
"loss": 1.2244,
"mean_token_accuracy": 0.7112350463867188,
"num_tokens": 283060115.0,
"step": 3070
},
{
"entropy": 1.20390625,
"epoch": 0.42093754270876044,
"grad_norm": 0.08072927431592543,
"learning_rate": 4.070381851486544e-06,
"loss": 1.2127,
"mean_token_accuracy": 0.7155052840709686,
"num_tokens": 284015383.0,
"step": 3080
},
{
"entropy": 1.17265625,
"epoch": 0.4223042230422304,
"grad_norm": 0.07318843049497996,
"learning_rate": 4.066859236297027e-06,
"loss": 1.174,
"mean_token_accuracy": 0.72162926197052,
"num_tokens": 284957286.0,
"step": 3090
},
{
"entropy": 1.1953125,
"epoch": 0.4236709033757004,
"grad_norm": 0.0781440388665331,
"learning_rate": 4.06333662110751e-06,
"loss": 1.1985,
"mean_token_accuracy": 0.7175770282745362,
"num_tokens": 285849739.0,
"step": 3100
},
{
"entropy": 1.228125,
"epoch": 0.42503758370917044,
"grad_norm": 0.0737479883871913,
"learning_rate": 4.059814005917994e-06,
"loss": 1.2341,
"mean_token_accuracy": 0.7115173637866974,
"num_tokens": 286816194.0,
"step": 3110
},
{
"entropy": 1.19453125,
"epoch": 0.4264042640426404,
"grad_norm": 0.06735520083436816,
"learning_rate": 4.056291390728477e-06,
"loss": 1.1865,
"mean_token_accuracy": 0.7199340164661407,
"num_tokens": 287733305.0,
"step": 3120
},
{
"entropy": 1.24765625,
"epoch": 0.4277709443761104,
"grad_norm": 0.08243184445833185,
"learning_rate": 4.052768775538961e-06,
"loss": 1.2576,
"mean_token_accuracy": 0.7084418177604676,
"num_tokens": 288676265.0,
"step": 3130
},
{
"entropy": 1.19765625,
"epoch": 0.42913762470958045,
"grad_norm": 0.08042735662707992,
"learning_rate": 4.049246160349444e-06,
"loss": 1.1906,
"mean_token_accuracy": 0.7192962229251861,
"num_tokens": 289628921.0,
"step": 3140
},
{
"entropy": 1.18203125,
"epoch": 0.43050430504305043,
"grad_norm": 0.06697435257914378,
"learning_rate": 4.045723545159927e-06,
"loss": 1.1808,
"mean_token_accuracy": 0.720278400182724,
"num_tokens": 290508054.0,
"step": 3150
},
{
"entropy": 1.1390625,
"epoch": 0.4318709853765204,
"grad_norm": 0.0687638128876142,
"learning_rate": 4.04220092997041e-06,
"loss": 1.143,
"mean_token_accuracy": 0.7266555547714233,
"num_tokens": 291445061.0,
"step": 3160
},
{
"entropy": 1.175,
"epoch": 0.43323766570999045,
"grad_norm": 0.19974157918127827,
"learning_rate": 4.038678314780894e-06,
"loss": 1.1673,
"mean_token_accuracy": 0.7230919480323792,
"num_tokens": 292415043.0,
"step": 3170
},
{
"entropy": 1.17890625,
"epoch": 0.43460434604346043,
"grad_norm": 0.0710157678050328,
"learning_rate": 4.0351556995913775e-06,
"loss": 1.1831,
"mean_token_accuracy": 0.7181249678134918,
"num_tokens": 293378677.0,
"step": 3180
},
{
"entropy": 1.17265625,
"epoch": 0.4359710263769304,
"grad_norm": 0.06985044028176071,
"learning_rate": 4.03163308440186e-06,
"loss": 1.176,
"mean_token_accuracy": 0.7207567274570466,
"num_tokens": 294289221.0,
"step": 3190
},
{
"entropy": 1.2140625,
"epoch": 0.43733770671040045,
"grad_norm": 0.08016092230756136,
"learning_rate": 4.028110469212344e-06,
"loss": 1.2154,
"mean_token_accuracy": 0.715403139591217,
"num_tokens": 295166106.0,
"step": 3200
},
{
"entropy": 1.190625,
"epoch": 0.43870438704387044,
"grad_norm": 0.06959970058083874,
"learning_rate": 4.0245878540228264e-06,
"loss": 1.1861,
"mean_token_accuracy": 0.7193870127201081,
"num_tokens": 296076562.0,
"step": 3210
},
{
"entropy": 1.23984375,
"epoch": 0.4400710673773404,
"grad_norm": 0.07616947117741496,
"learning_rate": 4.02106523883331e-06,
"loss": 1.2354,
"mean_token_accuracy": 0.7107770264148712,
"num_tokens": 296967830.0,
"step": 3220
},
{
"entropy": 1.16015625,
"epoch": 0.44143774771081046,
"grad_norm": 0.07967006549833325,
"learning_rate": 4.0175426236437935e-06,
"loss": 1.1686,
"mean_token_accuracy": 0.7238077223300934,
"num_tokens": 297854033.0,
"step": 3230
},
{
"entropy": 1.1703125,
"epoch": 0.44280442804428044,
"grad_norm": 0.08679518751088995,
"learning_rate": 4.014020008454277e-06,
"loss": 1.1762,
"mean_token_accuracy": 0.7209996938705444,
"num_tokens": 298775899.0,
"step": 3240
},
{
"entropy": 1.15859375,
"epoch": 0.4441711083777504,
"grad_norm": 0.07128480690163845,
"learning_rate": 4.01049739326476e-06,
"loss": 1.1476,
"mean_token_accuracy": 0.7240599095821381,
"num_tokens": 299644998.0,
"step": 3250
},
{
"entropy": 1.196875,
"epoch": 0.44553778871122046,
"grad_norm": 0.1305433307180616,
"learning_rate": 4.006974778075243e-06,
"loss": 1.1936,
"mean_token_accuracy": 0.7183843851089478,
"num_tokens": 300579588.0,
"step": 3260
},
{
"entropy": 1.20078125,
"epoch": 0.44690446904469044,
"grad_norm": 0.07053308510879722,
"learning_rate": 4.003452162885727e-06,
"loss": 1.2027,
"mean_token_accuracy": 0.7146745204925538,
"num_tokens": 301539377.0,
"step": 3270
},
{
"entropy": 1.13828125,
"epoch": 0.4482711493781604,
"grad_norm": 0.07848151323549829,
"learning_rate": 3.99992954769621e-06,
"loss": 1.149,
"mean_token_accuracy": 0.7264383137226105,
"num_tokens": 302459676.0,
"step": 3280
},
{
"entropy": 1.1453125,
"epoch": 0.44963782971163047,
"grad_norm": 0.06649170722440903,
"learning_rate": 3.996406932506693e-06,
"loss": 1.1411,
"mean_token_accuracy": 0.72639981508255,
"num_tokens": 303424014.0,
"step": 3290
},
{
"entropy": 1.125,
"epoch": 0.45100451004510045,
"grad_norm": 0.07324956572787877,
"learning_rate": 3.992884317317177e-06,
"loss": 1.1245,
"mean_token_accuracy": 0.7300680994987487,
"num_tokens": 304396334.0,
"step": 3300
},
{
"entropy": 1.23515625,
"epoch": 0.45237119037857043,
"grad_norm": 0.08292355054174534,
"learning_rate": 3.98936170212766e-06,
"loss": 1.2436,
"mean_token_accuracy": 0.7100316464900971,
"num_tokens": 305340507.0,
"step": 3310
},
{
"entropy": 1.140625,
"epoch": 0.45373787071204047,
"grad_norm": 0.07112962172675508,
"learning_rate": 3.985839086938143e-06,
"loss": 1.1342,
"mean_token_accuracy": 0.7280888974666595,
"num_tokens": 306253804.0,
"step": 3320
},
{
"entropy": 1.184375,
"epoch": 0.45510455104551045,
"grad_norm": 0.07862062495698585,
"learning_rate": 3.9823164717486265e-06,
"loss": 1.1912,
"mean_token_accuracy": 0.7182447850704193,
"num_tokens": 307219655.0,
"step": 3330
},
{
"entropy": 1.1796875,
"epoch": 0.45647123137898044,
"grad_norm": 0.07667566861967837,
"learning_rate": 3.97879385655911e-06,
"loss": 1.1836,
"mean_token_accuracy": 0.7191243052482605,
"num_tokens": 308120248.0,
"step": 3340
},
{
"entropy": 1.20390625,
"epoch": 0.4578379117124505,
"grad_norm": 0.07981357020868199,
"learning_rate": 3.9752712413695936e-06,
"loss": 1.2056,
"mean_token_accuracy": 0.7153179526329041,
"num_tokens": 309033220.0,
"step": 3350
},
{
"entropy": 1.24765625,
"epoch": 0.45920459204592046,
"grad_norm": 0.07726123855019383,
"learning_rate": 3.971748626180076e-06,
"loss": 1.2515,
"mean_token_accuracy": 0.7064828991889953,
"num_tokens": 309966122.0,
"step": 3360
},
{
"entropy": 1.18359375,
"epoch": 0.46057127237939044,
"grad_norm": 0.07529026207816364,
"learning_rate": 3.96822601099056e-06,
"loss": 1.1775,
"mean_token_accuracy": 0.7205836296081543,
"num_tokens": 310927570.0,
"step": 3370
},
{
"entropy": 1.190625,
"epoch": 0.4619379527128605,
"grad_norm": 0.07363205450635169,
"learning_rate": 3.9647033958010425e-06,
"loss": 1.2061,
"mean_token_accuracy": 0.7147302210330964,
"num_tokens": 311832124.0,
"step": 3380
},
{
"entropy": 1.1828125,
"epoch": 0.46330463304633046,
"grad_norm": 0.07684475018905121,
"learning_rate": 3.961180780611527e-06,
"loss": 1.1848,
"mean_token_accuracy": 0.7168917119503021,
"num_tokens": 312752210.0,
"step": 3390
},
{
"entropy": 1.21875,
"epoch": 0.46467131337980044,
"grad_norm": 0.06659524402225907,
"learning_rate": 3.95765816542201e-06,
"loss": 1.2144,
"mean_token_accuracy": 0.7148098886013031,
"num_tokens": 313721040.0,
"step": 3400
},
{
"entropy": 1.133203125,
"epoch": 0.4660379937132705,
"grad_norm": 0.07371440032624303,
"learning_rate": 3.954135550232493e-06,
"loss": 1.1255,
"mean_token_accuracy": 0.7287675261497497,
"num_tokens": 314623635.0,
"step": 3410
},
{
"entropy": 1.20546875,
"epoch": 0.46740467404674046,
"grad_norm": 0.07821691108551296,
"learning_rate": 3.950612935042976e-06,
"loss": 1.202,
"mean_token_accuracy": 0.7171783745288849,
"num_tokens": 315549717.0,
"step": 3420
},
{
"entropy": 1.1546875,
"epoch": 0.46877135438021045,
"grad_norm": 0.07547007071434032,
"learning_rate": 3.947090319853459e-06,
"loss": 1.1457,
"mean_token_accuracy": 0.7260000824928283,
"num_tokens": 316459223.0,
"step": 3430
},
{
"entropy": 1.24375,
"epoch": 0.4701380347136805,
"grad_norm": 0.07938397284037002,
"learning_rate": 3.943567704663943e-06,
"loss": 1.2389,
"mean_token_accuracy": 0.7105916500091553,
"num_tokens": 317359360.0,
"step": 3440
},
{
"entropy": 1.1796875,
"epoch": 0.47150471504715047,
"grad_norm": 0.09298506387573126,
"learning_rate": 3.9400450894744265e-06,
"loss": 1.1758,
"mean_token_accuracy": 0.7203917145729065,
"num_tokens": 318239868.0,
"step": 3450
},
{
"entropy": 1.23984375,
"epoch": 0.47287139538062045,
"grad_norm": 0.07626633133475272,
"learning_rate": 3.936522474284909e-06,
"loss": 1.2575,
"mean_token_accuracy": 0.7067981064319611,
"num_tokens": 319176882.0,
"step": 3460
},
{
"entropy": 1.18125,
"epoch": 0.4742380757140905,
"grad_norm": 0.07573794064962275,
"learning_rate": 3.932999859095393e-06,
"loss": 1.1892,
"mean_token_accuracy": 0.7190979719161987,
"num_tokens": 320165510.0,
"step": 3470
},
{
"entropy": 1.16015625,
"epoch": 0.4756047560475605,
"grad_norm": 0.07671228725072554,
"learning_rate": 3.929477243905876e-06,
"loss": 1.1527,
"mean_token_accuracy": 0.7267825365066528,
"num_tokens": 321088771.0,
"step": 3480
},
{
"entropy": 1.215625,
"epoch": 0.47697143638103046,
"grad_norm": 0.08629477699175946,
"learning_rate": 3.925954628716359e-06,
"loss": 1.2096,
"mean_token_accuracy": 0.7166905045509339,
"num_tokens": 322002114.0,
"step": 3490
},
{
"entropy": 1.2046875,
"epoch": 0.4783381167145005,
"grad_norm": 0.07830310298377441,
"learning_rate": 3.9224320135268425e-06,
"loss": 1.1983,
"mean_token_accuracy": 0.7166287660598755,
"num_tokens": 322898306.0,
"step": 3500
},
{
"entropy": 1.20546875,
"epoch": 0.4797047970479705,
"grad_norm": 0.07670622943812533,
"learning_rate": 3.918909398337326e-06,
"loss": 1.2027,
"mean_token_accuracy": 0.7165621936321258,
"num_tokens": 323816623.0,
"step": 3510
},
{
"entropy": 1.16796875,
"epoch": 0.48107147738144046,
"grad_norm": 0.07324693416345282,
"learning_rate": 3.91538678314781e-06,
"loss": 1.1638,
"mean_token_accuracy": 0.7246742069721221,
"num_tokens": 324757771.0,
"step": 3520
},
{
"entropy": 1.1953125,
"epoch": 0.4824381577149105,
"grad_norm": 0.07389563311223063,
"learning_rate": 3.911864167958292e-06,
"loss": 1.185,
"mean_token_accuracy": 0.7201698780059814,
"num_tokens": 325695539.0,
"step": 3530
},
{
"entropy": 1.15859375,
"epoch": 0.4838048380483805,
"grad_norm": 0.0727189907583997,
"learning_rate": 3.908341552768776e-06,
"loss": 1.1524,
"mean_token_accuracy": 0.7246108949184418,
"num_tokens": 326619627.0,
"step": 3540
},
{
"entropy": 1.1578125,
"epoch": 0.48517151838185046,
"grad_norm": 0.07452616088346524,
"learning_rate": 3.9048189375792586e-06,
"loss": 1.1585,
"mean_token_accuracy": 0.721977823972702,
"num_tokens": 327522409.0,
"step": 3550
},
{
"entropy": 1.2046875,
"epoch": 0.4865381987153205,
"grad_norm": 0.07464248551406584,
"learning_rate": 3.901296322389743e-06,
"loss": 1.2116,
"mean_token_accuracy": 0.7125469863414764,
"num_tokens": 328418711.0,
"step": 3560
},
{
"entropy": 1.20625,
"epoch": 0.4879048790487905,
"grad_norm": 0.07169780004228869,
"learning_rate": 3.897773707200226e-06,
"loss": 1.1914,
"mean_token_accuracy": 0.7174623966217041,
"num_tokens": 329322871.0,
"step": 3570
},
{
"entropy": 1.1828125,
"epoch": 0.48927155938226047,
"grad_norm": 0.072854338611264,
"learning_rate": 3.894251092010709e-06,
"loss": 1.1663,
"mean_token_accuracy": 0.7214651226997375,
"num_tokens": 330272623.0,
"step": 3580
},
{
"entropy": 1.16953125,
"epoch": 0.4906382397157305,
"grad_norm": 0.07878674188927036,
"learning_rate": 3.890728476821192e-06,
"loss": 1.16,
"mean_token_accuracy": 0.7242635607719421,
"num_tokens": 331160163.0,
"step": 3590
},
{
"entropy": 1.13125,
"epoch": 0.4920049200492005,
"grad_norm": 0.0791463721843774,
"learning_rate": 3.8872058616316755e-06,
"loss": 1.1307,
"mean_token_accuracy": 0.7301302373409271,
"num_tokens": 332041350.0,
"step": 3600
},
{
"entropy": 1.15703125,
"epoch": 0.49337160038267047,
"grad_norm": 0.0726635639488206,
"learning_rate": 3.883683246442159e-06,
"loss": 1.1472,
"mean_token_accuracy": 0.725098866224289,
"num_tokens": 332958062.0,
"step": 3610
},
{
"entropy": 1.18828125,
"epoch": 0.4947382807161405,
"grad_norm": 0.07502885365026811,
"learning_rate": 3.8801606312526426e-06,
"loss": 1.1862,
"mean_token_accuracy": 0.7198416411876678,
"num_tokens": 333926373.0,
"step": 3620
},
{
"entropy": 1.0984375,
"epoch": 0.4961049610496105,
"grad_norm": 0.07315696111011304,
"learning_rate": 3.876638016063125e-06,
"loss": 1.0825,
"mean_token_accuracy": 0.7389237880706787,
"num_tokens": 334867149.0,
"step": 3630
},
{
"entropy": 1.1640625,
"epoch": 0.4974716413830805,
"grad_norm": 0.07808120655802787,
"learning_rate": 3.873115400873609e-06,
"loss": 1.1644,
"mean_token_accuracy": 0.7233481228351593,
"num_tokens": 335748006.0,
"step": 3640
},
{
"entropy": 1.18515625,
"epoch": 0.4988383217165505,
"grad_norm": 0.07435680089154029,
"learning_rate": 3.869592785684092e-06,
"loss": 1.1924,
"mean_token_accuracy": 0.719150984287262,
"num_tokens": 336676648.0,
"step": 3650
},
{
"entropy": 1.1875,
"epoch": 0.5002050020500205,
"grad_norm": 0.06300002118389027,
"learning_rate": 3.866070170494575e-06,
"loss": 1.1919,
"mean_token_accuracy": 0.7195121884346009,
"num_tokens": 337574385.0,
"step": 3660
},
{
"entropy": 1.18515625,
"epoch": 0.5015716823834905,
"grad_norm": 0.07705995448547087,
"learning_rate": 3.862547555305059e-06,
"loss": 1.1907,
"mean_token_accuracy": 0.7175062239170075,
"num_tokens": 338532100.0,
"step": 3670
},
{
"entropy": 1.16640625,
"epoch": 0.5029383627169605,
"grad_norm": 0.07885506084084724,
"learning_rate": 3.859024940115542e-06,
"loss": 1.1742,
"mean_token_accuracy": 0.7206377983093262,
"num_tokens": 339464473.0,
"step": 3680
},
{
"entropy": 1.129296875,
"epoch": 0.5043050430504306,
"grad_norm": 0.07261757297427851,
"learning_rate": 3.855502324926026e-06,
"loss": 1.127,
"mean_token_accuracy": 0.731986790895462,
"num_tokens": 340432340.0,
"step": 3690
},
{
"entropy": 1.16875,
"epoch": 0.5056717233839005,
"grad_norm": 0.07419633510804463,
"learning_rate": 3.851979709736508e-06,
"loss": 1.1755,
"mean_token_accuracy": 0.7232293844223022,
"num_tokens": 341335395.0,
"step": 3700
},
{
"entropy": 1.20703125,
"epoch": 0.5070384037173705,
"grad_norm": 0.07277343702596831,
"learning_rate": 3.848457094546992e-06,
"loss": 1.2002,
"mean_token_accuracy": 0.717147272825241,
"num_tokens": 342247450.0,
"step": 3710
},
{
"entropy": 1.15703125,
"epoch": 0.5084050840508405,
"grad_norm": 0.0664506960671912,
"learning_rate": 3.8449344793574755e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.7246916890144348,
"num_tokens": 343198674.0,
"step": 3720
},
{
"entropy": 1.18984375,
"epoch": 0.5097717643843105,
"grad_norm": 0.07063366338892149,
"learning_rate": 3.841411864167959e-06,
"loss": 1.1894,
"mean_token_accuracy": 0.7176162838935852,
"num_tokens": 344106894.0,
"step": 3730
},
{
"entropy": 1.18828125,
"epoch": 0.5111384447177805,
"grad_norm": 0.07275585532265752,
"learning_rate": 3.837889248978442e-06,
"loss": 1.1905,
"mean_token_accuracy": 0.7177943706512451,
"num_tokens": 345010494.0,
"step": 3740
},
{
"entropy": 1.1734375,
"epoch": 0.5125051250512506,
"grad_norm": 0.09216063687946628,
"learning_rate": 3.834366633788925e-06,
"loss": 1.1787,
"mean_token_accuracy": 0.721210914850235,
"num_tokens": 345934020.0,
"step": 3750
},
{
"entropy": 1.19296875,
"epoch": 0.5138718053847205,
"grad_norm": 0.08154843545742471,
"learning_rate": 3.830844018599408e-06,
"loss": 1.1923,
"mean_token_accuracy": 0.7180031120777131,
"num_tokens": 346799355.0,
"step": 3760
},
{
"entropy": 1.20546875,
"epoch": 0.5152384857181905,
"grad_norm": 0.07419565383389785,
"learning_rate": 3.8273214034098915e-06,
"loss": 1.2012,
"mean_token_accuracy": 0.7170530259609222,
"num_tokens": 347721206.0,
"step": 3770
},
{
"entropy": 1.21171875,
"epoch": 0.5166051660516605,
"grad_norm": 0.0682501369659418,
"learning_rate": 3.823798788220375e-06,
"loss": 1.1977,
"mean_token_accuracy": 0.7155344188213348,
"num_tokens": 348616795.0,
"step": 3780
},
{
"entropy": 1.1671875,
"epoch": 0.5179718463851305,
"grad_norm": 0.08227464630560585,
"learning_rate": 3.820276173030859e-06,
"loss": 1.1626,
"mean_token_accuracy": 0.7243692696094512,
"num_tokens": 349508928.0,
"step": 3790
},
{
"entropy": 1.1796875,
"epoch": 0.5193385267186005,
"grad_norm": 0.07111909079508136,
"learning_rate": 3.816753557841341e-06,
"loss": 1.1834,
"mean_token_accuracy": 0.7196440398693085,
"num_tokens": 350404276.0,
"step": 3800
},
{
"entropy": 1.18828125,
"epoch": 0.5207052070520706,
"grad_norm": 0.07556587343244493,
"learning_rate": 3.813230942651825e-06,
"loss": 1.1883,
"mean_token_accuracy": 0.7197266459465027,
"num_tokens": 351273919.0,
"step": 3810
},
{
"entropy": 1.190625,
"epoch": 0.5220718873855406,
"grad_norm": 0.07136508965837707,
"learning_rate": 3.809708327462308e-06,
"loss": 1.1864,
"mean_token_accuracy": 0.7174296200275421,
"num_tokens": 352170178.0,
"step": 3820
},
{
"entropy": 1.1859375,
"epoch": 0.5234385677190105,
"grad_norm": 0.07316177813198317,
"learning_rate": 3.806185712272792e-06,
"loss": 1.1866,
"mean_token_accuracy": 0.7197030186653137,
"num_tokens": 353108363.0,
"step": 3830
},
{
"entropy": 1.25625,
"epoch": 0.5248052480524805,
"grad_norm": 0.07412908074009068,
"learning_rate": 3.802663097083275e-06,
"loss": 1.2691,
"mean_token_accuracy": 0.7025612652301788,
"num_tokens": 353972875.0,
"step": 3840
},
{
"entropy": 1.1671875,
"epoch": 0.5261719283859505,
"grad_norm": 0.25777510506677614,
"learning_rate": 3.799140481893758e-06,
"loss": 1.1613,
"mean_token_accuracy": 0.7231945753097534,
"num_tokens": 354864325.0,
"step": 3850
},
{
"entropy": 1.16171875,
"epoch": 0.5275386087194205,
"grad_norm": 0.07098843486718023,
"learning_rate": 3.7956178667042413e-06,
"loss": 1.1547,
"mean_token_accuracy": 0.7258411765098571,
"num_tokens": 355743210.0,
"step": 3860
},
{
"entropy": 1.132421875,
"epoch": 0.5289052890528906,
"grad_norm": 0.06906039727920894,
"learning_rate": 3.792095251514725e-06,
"loss": 1.1247,
"mean_token_accuracy": 0.728282356262207,
"num_tokens": 356680442.0,
"step": 3870
},
{
"entropy": 1.146875,
"epoch": 0.5302719693863606,
"grad_norm": 0.07314874213112174,
"learning_rate": 3.788572636325208e-06,
"loss": 1.1514,
"mean_token_accuracy": 0.7265850961208343,
"num_tokens": 357633513.0,
"step": 3880
},
{
"entropy": 1.159375,
"epoch": 0.5316386497198305,
"grad_norm": 0.07348075073920245,
"learning_rate": 3.7850500211356916e-06,
"loss": 1.1612,
"mean_token_accuracy": 0.7219974100589752,
"num_tokens": 358560575.0,
"step": 3890
},
{
"entropy": 1.18046875,
"epoch": 0.5330053300533005,
"grad_norm": 0.07955127874086762,
"learning_rate": 3.7815274059461747e-06,
"loss": 1.1731,
"mean_token_accuracy": 0.720606517791748,
"num_tokens": 359474308.0,
"step": 3900
},
{
"entropy": 1.15078125,
"epoch": 0.5343720103867705,
"grad_norm": 0.06748206574082737,
"learning_rate": 3.7780047907566582e-06,
"loss": 1.1629,
"mean_token_accuracy": 0.7215077936649322,
"num_tokens": 360411784.0,
"step": 3910
},
{
"entropy": 1.17265625,
"epoch": 0.5357386907202405,
"grad_norm": 0.0704086335662569,
"learning_rate": 3.7744821755671413e-06,
"loss": 1.1791,
"mean_token_accuracy": 0.7215216815471649,
"num_tokens": 361350524.0,
"step": 3920
},
{
"entropy": 1.17890625,
"epoch": 0.5371053710537106,
"grad_norm": 0.07167212099386118,
"learning_rate": 3.7709595603776245e-06,
"loss": 1.1968,
"mean_token_accuracy": 0.7180787622928619,
"num_tokens": 362283058.0,
"step": 3930
},
{
"entropy": 1.2046875,
"epoch": 0.5384720513871806,
"grad_norm": 0.08391696251764394,
"learning_rate": 3.767436945188108e-06,
"loss": 1.21,
"mean_token_accuracy": 0.714562714099884,
"num_tokens": 363194527.0,
"step": 3940
},
{
"entropy": 1.16953125,
"epoch": 0.5398387317206506,
"grad_norm": 0.07597958581908161,
"learning_rate": 3.7639143299985916e-06,
"loss": 1.1706,
"mean_token_accuracy": 0.7221882939338684,
"num_tokens": 364108504.0,
"step": 3950
},
{
"entropy": 1.21328125,
"epoch": 0.5412054120541205,
"grad_norm": 0.07288042419388469,
"learning_rate": 3.7603917148090747e-06,
"loss": 1.2246,
"mean_token_accuracy": 0.7127920150756836,
"num_tokens": 365029878.0,
"step": 3960
},
{
"entropy": 1.13984375,
"epoch": 0.5425720923875905,
"grad_norm": 0.07163289559232051,
"learning_rate": 3.756869099619558e-06,
"loss": 1.146,
"mean_token_accuracy": 0.7282390773296357,
"num_tokens": 365964480.0,
"step": 3970
},
{
"entropy": 1.1546875,
"epoch": 0.5439387727210605,
"grad_norm": 0.07085330806497302,
"learning_rate": 3.753346484430041e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.7244387567043304,
"num_tokens": 366872780.0,
"step": 3980
},
{
"entropy": 1.1578125,
"epoch": 0.5453054530545306,
"grad_norm": 0.07807500561641706,
"learning_rate": 3.749823869240524e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.726214474439621,
"num_tokens": 367776522.0,
"step": 3990
},
{
"entropy": 1.13515625,
"epoch": 0.5466721333880006,
"grad_norm": 0.07934262166040291,
"learning_rate": 3.746301254051008e-06,
"loss": 1.1297,
"mean_token_accuracy": 0.7296829342842102,
"num_tokens": 368687371.0,
"step": 4000
},
{
"entropy": 1.13671875,
"epoch": 0.5480388137214706,
"grad_norm": 0.07151374452328214,
"learning_rate": 3.742778638861491e-06,
"loss": 1.1394,
"mean_token_accuracy": 0.7266717553138733,
"num_tokens": 369596596.0,
"step": 4010
},
{
"entropy": 1.17421875,
"epoch": 0.5494054940549405,
"grad_norm": 0.07551981178560788,
"learning_rate": 3.7392560236719743e-06,
"loss": 1.1714,
"mean_token_accuracy": 0.7202622532844544,
"num_tokens": 370474928.0,
"step": 4020
},
{
"entropy": 1.11953125,
"epoch": 0.5507721743884105,
"grad_norm": 0.07044768718606362,
"learning_rate": 3.7357334084824574e-06,
"loss": 1.1231,
"mean_token_accuracy": 0.7311617612838746,
"num_tokens": 371384674.0,
"step": 4030
},
{
"entropy": 1.14609375,
"epoch": 0.5521388547218805,
"grad_norm": 0.0766540287985735,
"learning_rate": 3.732210793292941e-06,
"loss": 1.1354,
"mean_token_accuracy": 0.728563392162323,
"num_tokens": 372309541.0,
"step": 4040
},
{
"entropy": 1.2,
"epoch": 0.5535055350553506,
"grad_norm": 0.07527011108879596,
"learning_rate": 3.7286881781034245e-06,
"loss": 1.1996,
"mean_token_accuracy": 0.7162248492240906,
"num_tokens": 373247679.0,
"step": 4050
},
{
"entropy": 1.16796875,
"epoch": 0.5548722153888206,
"grad_norm": 0.0671518386432483,
"learning_rate": 3.7251655629139076e-06,
"loss": 1.1619,
"mean_token_accuracy": 0.7243609607219696,
"num_tokens": 374199142.0,
"step": 4060
},
{
"entropy": 1.2140625,
"epoch": 0.5562388957222906,
"grad_norm": 0.06486351578004718,
"learning_rate": 3.7216429477243907e-06,
"loss": 1.2039,
"mean_token_accuracy": 0.7148890256881714,
"num_tokens": 375123594.0,
"step": 4070
},
{
"entropy": 1.16953125,
"epoch": 0.5576055760557606,
"grad_norm": 0.06888110048801682,
"learning_rate": 3.7181203325348743e-06,
"loss": 1.1626,
"mean_token_accuracy": 0.7232193887233734,
"num_tokens": 376085352.0,
"step": 4080
},
{
"entropy": 1.19609375,
"epoch": 0.5589722563892305,
"grad_norm": 0.07326113762817911,
"learning_rate": 3.7145977173453574e-06,
"loss": 1.1906,
"mean_token_accuracy": 0.7177609145641327,
"num_tokens": 376981305.0,
"step": 4090
},
{
"entropy": 1.159375,
"epoch": 0.5603389367227005,
"grad_norm": 0.06856841074052279,
"learning_rate": 3.7110751021558405e-06,
"loss": 1.1664,
"mean_token_accuracy": 0.7227122545242309,
"num_tokens": 377879583.0,
"step": 4100
},
{
"entropy": 1.15625,
"epoch": 0.5617056170561706,
"grad_norm": 0.08134543747346984,
"learning_rate": 3.7075524869663245e-06,
"loss": 1.1584,
"mean_token_accuracy": 0.7233270883560181,
"num_tokens": 378832645.0,
"step": 4110
},
{
"entropy": 1.17109375,
"epoch": 0.5630722973896406,
"grad_norm": 0.06934229793578206,
"learning_rate": 3.7040298717768076e-06,
"loss": 1.1597,
"mean_token_accuracy": 0.7247578203678131,
"num_tokens": 379779952.0,
"step": 4120
},
{
"entropy": 1.20703125,
"epoch": 0.5644389777231106,
"grad_norm": 0.07523687960466391,
"learning_rate": 3.7005072565872908e-06,
"loss": 1.1946,
"mean_token_accuracy": 0.7167641818523407,
"num_tokens": 380737380.0,
"step": 4130
},
{
"entropy": 1.19453125,
"epoch": 0.5658056580565806,
"grad_norm": 0.07345823129588108,
"learning_rate": 3.696984641397774e-06,
"loss": 1.196,
"mean_token_accuracy": 0.7173387050628662,
"num_tokens": 381656151.0,
"step": 4140
},
{
"entropy": 1.12890625,
"epoch": 0.5671723383900505,
"grad_norm": 0.076733700543983,
"learning_rate": 3.693462026208257e-06,
"loss": 1.1261,
"mean_token_accuracy": 0.7295246183872223,
"num_tokens": 382571650.0,
"step": 4150
},
{
"entropy": 1.17265625,
"epoch": 0.5685390187235205,
"grad_norm": 0.06779213761532582,
"learning_rate": 3.689939411018741e-06,
"loss": 1.1811,
"mean_token_accuracy": 0.72144735455513,
"num_tokens": 383530667.0,
"step": 4160
},
{
"entropy": 1.1203125,
"epoch": 0.5699056990569906,
"grad_norm": 0.07563655716034648,
"learning_rate": 3.686416795829224e-06,
"loss": 1.1077,
"mean_token_accuracy": 0.7330219805240631,
"num_tokens": 384438746.0,
"step": 4170
},
{
"entropy": 1.1640625,
"epoch": 0.5712723793904606,
"grad_norm": 0.07731046255027603,
"learning_rate": 3.6828941806397072e-06,
"loss": 1.1656,
"mean_token_accuracy": 0.7237406849861145,
"num_tokens": 385325961.0,
"step": 4180
},
{
"entropy": 1.15625,
"epoch": 0.5726390597239306,
"grad_norm": 0.07497430465672228,
"learning_rate": 3.6793715654501903e-06,
"loss": 1.1545,
"mean_token_accuracy": 0.7242677867412567,
"num_tokens": 386264764.0,
"step": 4190
},
{
"entropy": 1.17578125,
"epoch": 0.5740057400574006,
"grad_norm": 0.10013350847130958,
"learning_rate": 3.6758489502606735e-06,
"loss": 1.1792,
"mean_token_accuracy": 0.721012556552887,
"num_tokens": 387231562.0,
"step": 4200
},
{
"entropy": 1.1890625,
"epoch": 0.5753724203908706,
"grad_norm": 0.08155908265504527,
"learning_rate": 3.672326335071157e-06,
"loss": 1.1839,
"mean_token_accuracy": 0.7199913740158081,
"num_tokens": 388124752.0,
"step": 4210
},
{
"entropy": 1.175,
"epoch": 0.5767391007243405,
"grad_norm": 0.06784485280180773,
"learning_rate": 3.6688037198816406e-06,
"loss": 1.1812,
"mean_token_accuracy": 0.720984011888504,
"num_tokens": 389034093.0,
"step": 4220
},
{
"entropy": 1.153125,
"epoch": 0.5781057810578106,
"grad_norm": 0.07681952535480933,
"learning_rate": 3.6652811046921237e-06,
"loss": 1.1607,
"mean_token_accuracy": 0.7252882778644562,
"num_tokens": 389920605.0,
"step": 4230
},
{
"entropy": 1.16875,
"epoch": 0.5794724613912806,
"grad_norm": 0.06970753312153345,
"learning_rate": 3.6617584895026072e-06,
"loss": 1.1696,
"mean_token_accuracy": 0.7215573906898498,
"num_tokens": 390802728.0,
"step": 4240
},
{
"entropy": 1.1875,
"epoch": 0.5808391417247506,
"grad_norm": 0.07185189798265716,
"learning_rate": 3.6582358743130904e-06,
"loss": 1.185,
"mean_token_accuracy": 0.7172405362129212,
"num_tokens": 391735027.0,
"step": 4250
},
{
"entropy": 1.19140625,
"epoch": 0.5822058220582206,
"grad_norm": 0.08076535288041209,
"learning_rate": 3.6547132591235735e-06,
"loss": 1.1927,
"mean_token_accuracy": 0.7182661354541778,
"num_tokens": 392621164.0,
"step": 4260
},
{
"entropy": 1.209375,
"epoch": 0.5835725023916906,
"grad_norm": 0.09573190358338328,
"learning_rate": 3.651190643934057e-06,
"loss": 1.222,
"mean_token_accuracy": 0.7134230256080627,
"num_tokens": 393474315.0,
"step": 4270
},
{
"entropy": 1.175,
"epoch": 0.5849391827251605,
"grad_norm": 0.07262599160160904,
"learning_rate": 3.6476680287445406e-06,
"loss": 1.1729,
"mean_token_accuracy": 0.7219851732254028,
"num_tokens": 394416453.0,
"step": 4280
},
{
"entropy": 1.1453125,
"epoch": 0.5863058630586306,
"grad_norm": 0.06972216008160104,
"learning_rate": 3.6441454135550237e-06,
"loss": 1.1356,
"mean_token_accuracy": 0.7291145145893096,
"num_tokens": 395353788.0,
"step": 4290
},
{
"entropy": 1.1796875,
"epoch": 0.5876725433921006,
"grad_norm": 0.07176324563391479,
"learning_rate": 3.640622798365507e-06,
"loss": 1.1841,
"mean_token_accuracy": 0.7185623109340668,
"num_tokens": 396277079.0,
"step": 4300
},
{
"entropy": 1.2109375,
"epoch": 0.5890392237255706,
"grad_norm": 0.06905281723296035,
"learning_rate": 3.63710018317599e-06,
"loss": 1.2068,
"mean_token_accuracy": 0.714428162574768,
"num_tokens": 397204872.0,
"step": 4310
},
{
"entropy": 1.23046875,
"epoch": 0.5904059040590406,
"grad_norm": 0.08125171280130498,
"learning_rate": 3.633577567986473e-06,
"loss": 1.2394,
"mean_token_accuracy": 0.7100616514682769,
"num_tokens": 398106709.0,
"step": 4320
},
{
"entropy": 1.1671875,
"epoch": 0.5917725843925106,
"grad_norm": 0.075860236857997,
"learning_rate": 3.630054952796957e-06,
"loss": 1.1619,
"mean_token_accuracy": 0.7243852615356445,
"num_tokens": 399003959.0,
"step": 4330
},
{
"entropy": 1.21328125,
"epoch": 0.5931392647259806,
"grad_norm": 0.0734082679583243,
"learning_rate": 3.62653233760744e-06,
"loss": 1.2095,
"mean_token_accuracy": 0.7141608238220215,
"num_tokens": 399928261.0,
"step": 4340
},
{
"entropy": 1.17109375,
"epoch": 0.5945059450594506,
"grad_norm": 0.0862203546864696,
"learning_rate": 3.6230097224179233e-06,
"loss": 1.1647,
"mean_token_accuracy": 0.7231438100337982,
"num_tokens": 400734644.0,
"step": 4350
},
{
"entropy": 1.16796875,
"epoch": 0.5958726253929206,
"grad_norm": 0.08283032998117208,
"learning_rate": 3.6194871072284064e-06,
"loss": 1.1759,
"mean_token_accuracy": 0.7217179894447326,
"num_tokens": 401677248.0,
"step": 4360
},
{
"entropy": 1.19375,
"epoch": 0.5972393057263906,
"grad_norm": 0.07298480581570968,
"learning_rate": 3.61596449203889e-06,
"loss": 1.1894,
"mean_token_accuracy": 0.7186711490154266,
"num_tokens": 402589892.0,
"step": 4370
},
{
"entropy": 1.13125,
"epoch": 0.5986059860598606,
"grad_norm": 0.06673223628118874,
"learning_rate": 3.6124418768493735e-06,
"loss": 1.1264,
"mean_token_accuracy": 0.7304266929626465,
"num_tokens": 403493521.0,
"step": 4380
},
{
"entropy": 1.16484375,
"epoch": 0.5999726663933306,
"grad_norm": 0.0699852826598973,
"learning_rate": 3.6089192616598566e-06,
"loss": 1.171,
"mean_token_accuracy": 0.7232425749301911,
"num_tokens": 404481917.0,
"step": 4390
},
{
"entropy": 1.16875,
"epoch": 0.6013393467268006,
"grad_norm": 0.07402442252742719,
"learning_rate": 3.6053966464703398e-06,
"loss": 1.1771,
"mean_token_accuracy": 0.7203011810779572,
"num_tokens": 405381103.0,
"step": 4400
},
{
"entropy": 1.21171875,
"epoch": 0.6027060270602707,
"grad_norm": 0.10157549535601608,
"learning_rate": 3.6018740312808233e-06,
"loss": 1.2193,
"mean_token_accuracy": 0.7120676636695862,
"num_tokens": 406360551.0,
"step": 4410
},
{
"entropy": 1.1640625,
"epoch": 0.6040727073937406,
"grad_norm": 0.07636527604139738,
"learning_rate": 3.5983514160913064e-06,
"loss": 1.1709,
"mean_token_accuracy": 0.7201756119728089,
"num_tokens": 407306942.0,
"step": 4420
},
{
"entropy": 1.165625,
"epoch": 0.6054393877272106,
"grad_norm": 0.07114332135498826,
"learning_rate": 3.5948288009017895e-06,
"loss": 1.1619,
"mean_token_accuracy": 0.723280155658722,
"num_tokens": 408238455.0,
"step": 4430
},
{
"entropy": 1.1890625,
"epoch": 0.6068060680606806,
"grad_norm": 0.0783186343185649,
"learning_rate": 3.591306185712273e-06,
"loss": 1.1856,
"mean_token_accuracy": 0.7190285086631775,
"num_tokens": 409152396.0,
"step": 4440
},
{
"entropy": 1.1984375,
"epoch": 0.6081727483941506,
"grad_norm": 0.06764731554860437,
"learning_rate": 3.5877835705227566e-06,
"loss": 1.1941,
"mean_token_accuracy": 0.7173039495944977,
"num_tokens": 410053504.0,
"step": 4450
},
{
"entropy": 1.1234375,
"epoch": 0.6095394287276206,
"grad_norm": 0.11923504897129421,
"learning_rate": 3.5842609553332398e-06,
"loss": 1.1215,
"mean_token_accuracy": 0.7310825288295746,
"num_tokens": 410972109.0,
"step": 4460
},
{
"entropy": 1.1703125,
"epoch": 0.6109061090610907,
"grad_norm": 0.07635934493292273,
"learning_rate": 3.580738340143723e-06,
"loss": 1.1704,
"mean_token_accuracy": 0.7216120719909668,
"num_tokens": 411889574.0,
"step": 4470
},
{
"entropy": 1.16875,
"epoch": 0.6122727893945606,
"grad_norm": 0.07582397472743163,
"learning_rate": 3.577215724954206e-06,
"loss": 1.1733,
"mean_token_accuracy": 0.7230900406837464,
"num_tokens": 412842473.0,
"step": 4480
},
{
"entropy": 1.20859375,
"epoch": 0.6136394697280306,
"grad_norm": 0.07273069844046441,
"learning_rate": 3.57369310976469e-06,
"loss": 1.2153,
"mean_token_accuracy": 0.7132490575313568,
"num_tokens": 413720887.0,
"step": 4490
},
{
"entropy": 1.20078125,
"epoch": 0.6150061500615006,
"grad_norm": 0.07094392823789729,
"learning_rate": 3.570170494575173e-06,
"loss": 1.1973,
"mean_token_accuracy": 0.7201809167861939,
"num_tokens": 414663481.0,
"step": 4500
},
{
"entropy": 1.2390625,
"epoch": 0.6163728303949706,
"grad_norm": 0.07743980296695,
"learning_rate": 3.5666478793856562e-06,
"loss": 1.2559,
"mean_token_accuracy": 0.7059875130653381,
"num_tokens": 415566128.0,
"step": 4510
},
{
"entropy": 1.148828125,
"epoch": 0.6177395107284406,
"grad_norm": 0.07493676555427585,
"learning_rate": 3.5631252641961394e-06,
"loss": 1.1473,
"mean_token_accuracy": 0.7260205447673798,
"num_tokens": 416455167.0,
"step": 4520
},
{
"entropy": 1.109375,
"epoch": 0.6191061910619107,
"grad_norm": 0.07369250966393795,
"learning_rate": 3.5596026490066225e-06,
"loss": 1.1134,
"mean_token_accuracy": 0.7322049021720887,
"num_tokens": 417403231.0,
"step": 4530
},
{
"entropy": 1.14921875,
"epoch": 0.6204728713953807,
"grad_norm": 0.07664927732866114,
"learning_rate": 3.556080033817106e-06,
"loss": 1.1629,
"mean_token_accuracy": 0.7241127490997314,
"num_tokens": 418291523.0,
"step": 4540
},
{
"entropy": 1.21171875,
"epoch": 0.6218395517288506,
"grad_norm": 0.07158077940070166,
"learning_rate": 3.5525574186275896e-06,
"loss": 1.2049,
"mean_token_accuracy": 0.7151435375213623,
"num_tokens": 419177494.0,
"step": 4550
},
{
"entropy": 1.184375,
"epoch": 0.6232062320623206,
"grad_norm": 0.07673594239176992,
"learning_rate": 3.5490348034380727e-06,
"loss": 1.1948,
"mean_token_accuracy": 0.7179465353488922,
"num_tokens": 420118197.0,
"step": 4560
},
{
"entropy": 1.1765625,
"epoch": 0.6245729123957906,
"grad_norm": 0.07205761044057808,
"learning_rate": 3.545512188248556e-06,
"loss": 1.179,
"mean_token_accuracy": 0.7194531917572021,
"num_tokens": 421017126.0,
"step": 4570
},
{
"entropy": 1.1421875,
"epoch": 0.6259395927292606,
"grad_norm": 0.07730891095322362,
"learning_rate": 3.5419895730590394e-06,
"loss": 1.1465,
"mean_token_accuracy": 0.7251220226287842,
"num_tokens": 421902299.0,
"step": 4580
},
{
"entropy": 1.21171875,
"epoch": 0.6273062730627307,
"grad_norm": 0.07280680029317045,
"learning_rate": 3.5384669578695225e-06,
"loss": 1.2124,
"mean_token_accuracy": 0.7149730563163758,
"num_tokens": 422810394.0,
"step": 4590
},
{
"entropy": 1.20625,
"epoch": 0.6286729533962007,
"grad_norm": 0.0722022721799792,
"learning_rate": 3.534944342680006e-06,
"loss": 1.2022,
"mean_token_accuracy": 0.7174553334712982,
"num_tokens": 423762943.0,
"step": 4600
},
{
"entropy": 1.171875,
"epoch": 0.6300396337296706,
"grad_norm": 0.07048884190790664,
"learning_rate": 3.5314217274904896e-06,
"loss": 1.177,
"mean_token_accuracy": 0.7209394097328186,
"num_tokens": 424688079.0,
"step": 4610
},
{
"entropy": 1.18359375,
"epoch": 0.6314063140631406,
"grad_norm": 0.0741619600342552,
"learning_rate": 3.5278991123009727e-06,
"loss": 1.1905,
"mean_token_accuracy": 0.7192073464393616,
"num_tokens": 425593816.0,
"step": 4620
},
{
"entropy": 1.1984375,
"epoch": 0.6327729943966106,
"grad_norm": 0.08297342491628795,
"learning_rate": 3.524376497111456e-06,
"loss": 1.1934,
"mean_token_accuracy": 0.7180655419826507,
"num_tokens": 426535659.0,
"step": 4630
},
{
"entropy": 1.1625,
"epoch": 0.6341396747300806,
"grad_norm": 0.08016055368798893,
"learning_rate": 3.520853881921939e-06,
"loss": 1.1575,
"mean_token_accuracy": 0.7236984014511109,
"num_tokens": 427448468.0,
"step": 4640
},
{
"entropy": 1.1640625,
"epoch": 0.6355063550635507,
"grad_norm": 0.07246833254138692,
"learning_rate": 3.517331266732422e-06,
"loss": 1.163,
"mean_token_accuracy": 0.7245487749576569,
"num_tokens": 428404261.0,
"step": 4650
},
{
"entropy": 1.23515625,
"epoch": 0.6368730353970207,
"grad_norm": 0.07344138676672263,
"learning_rate": 3.513808651542906e-06,
"loss": 1.2421,
"mean_token_accuracy": 0.7103794276714325,
"num_tokens": 429314923.0,
"step": 4660
},
{
"entropy": 1.16796875,
"epoch": 0.6382397157304907,
"grad_norm": 0.08060263089302534,
"learning_rate": 3.510286036353389e-06,
"loss": 1.1603,
"mean_token_accuracy": 0.7229906678199768,
"num_tokens": 430177549.0,
"step": 4670
},
{
"entropy": 1.16015625,
"epoch": 0.6396063960639606,
"grad_norm": 0.06522192667390764,
"learning_rate": 3.5067634211638723e-06,
"loss": 1.1642,
"mean_token_accuracy": 0.723647290468216,
"num_tokens": 431135284.0,
"step": 4680
},
{
"entropy": 1.17265625,
"epoch": 0.6409730763974306,
"grad_norm": 0.07295150393131267,
"learning_rate": 3.5032408059743554e-06,
"loss": 1.1774,
"mean_token_accuracy": 0.7210940599441529,
"num_tokens": 432065066.0,
"step": 4690
},
{
"entropy": 1.16328125,
"epoch": 0.6423397567309006,
"grad_norm": 0.07432338761434398,
"learning_rate": 3.499718190784839e-06,
"loss": 1.1616,
"mean_token_accuracy": 0.7221442461013794,
"num_tokens": 432943463.0,
"step": 4700
},
{
"entropy": 1.15859375,
"epoch": 0.6437064370643707,
"grad_norm": 0.07296207962118226,
"learning_rate": 3.4961955755953225e-06,
"loss": 1.1569,
"mean_token_accuracy": 0.7239503502845764,
"num_tokens": 433869594.0,
"step": 4710
},
{
"entropy": 1.19453125,
"epoch": 0.6450731173978407,
"grad_norm": 0.07025683570778858,
"learning_rate": 3.4926729604058056e-06,
"loss": 1.195,
"mean_token_accuracy": 0.7163998186588287,
"num_tokens": 434779798.0,
"step": 4720
},
{
"entropy": 1.159375,
"epoch": 0.6464397977313107,
"grad_norm": 0.07782367330119261,
"learning_rate": 3.4891503452162888e-06,
"loss": 1.1471,
"mean_token_accuracy": 0.7266017854213714,
"num_tokens": 435652803.0,
"step": 4730
},
{
"entropy": 1.14609375,
"epoch": 0.6478064780647806,
"grad_norm": 0.08795796364314289,
"learning_rate": 3.4856277300267723e-06,
"loss": 1.1469,
"mean_token_accuracy": 0.7256891608238221,
"num_tokens": 436545226.0,
"step": 4740
},
{
"entropy": 1.18515625,
"epoch": 0.6491731583982506,
"grad_norm": 0.07921112516118607,
"learning_rate": 3.4821051148372554e-06,
"loss": 1.1908,
"mean_token_accuracy": 0.7182975649833679,
"num_tokens": 437438317.0,
"step": 4750
},
{
"entropy": 1.1203125,
"epoch": 0.6505398387317206,
"grad_norm": 0.08393281570834064,
"learning_rate": 3.4785824996477386e-06,
"loss": 1.1002,
"mean_token_accuracy": 0.7346743047237396,
"num_tokens": 438316880.0,
"step": 4760
},
{
"entropy": 1.10625,
"epoch": 0.6519065190651907,
"grad_norm": 0.06894704783429643,
"learning_rate": 3.475059884458222e-06,
"loss": 1.1138,
"mean_token_accuracy": 0.7330049216747284,
"num_tokens": 439224474.0,
"step": 4770
},
{
"entropy": 1.1484375,
"epoch": 0.6532731993986607,
"grad_norm": 0.07414715473910768,
"learning_rate": 3.4715372692687057e-06,
"loss": 1.152,
"mean_token_accuracy": 0.7260858654975891,
"num_tokens": 440100963.0,
"step": 4780
},
{
"entropy": 1.2078125,
"epoch": 0.6546398797321307,
"grad_norm": 0.07507392814583182,
"learning_rate": 3.4680146540791888e-06,
"loss": 1.2171,
"mean_token_accuracy": 0.713755077123642,
"num_tokens": 441037518.0,
"step": 4790
},
{
"entropy": 1.175,
"epoch": 0.6560065600656007,
"grad_norm": 0.07886132929375848,
"learning_rate": 3.464492038889672e-06,
"loss": 1.1673,
"mean_token_accuracy": 0.7231556475162506,
"num_tokens": 441908052.0,
"step": 4800
},
{
"entropy": 1.18359375,
"epoch": 0.6573732403990706,
"grad_norm": 0.07289048754811829,
"learning_rate": 3.460969423700155e-06,
"loss": 1.191,
"mean_token_accuracy": 0.7171859920024872,
"num_tokens": 442877731.0,
"step": 4810
},
{
"entropy": 1.17109375,
"epoch": 0.6587399207325406,
"grad_norm": 0.07416248828012428,
"learning_rate": 3.457446808510639e-06,
"loss": 1.1648,
"mean_token_accuracy": 0.7245825707912446,
"num_tokens": 443815286.0,
"step": 4820
},
{
"entropy": 1.14296875,
"epoch": 0.6601066010660107,
"grad_norm": 0.07411615928576987,
"learning_rate": 3.453924193321122e-06,
"loss": 1.1562,
"mean_token_accuracy": 0.7233711838722229,
"num_tokens": 444773427.0,
"step": 4830
},
{
"entropy": 1.133984375,
"epoch": 0.6614732813994807,
"grad_norm": 0.07641923378802905,
"learning_rate": 3.4504015781316052e-06,
"loss": 1.124,
"mean_token_accuracy": 0.7312997221946717,
"num_tokens": 445718805.0,
"step": 4840
},
{
"entropy": 1.175,
"epoch": 0.6628399617329507,
"grad_norm": 0.07182962864386198,
"learning_rate": 3.4468789629420884e-06,
"loss": 1.189,
"mean_token_accuracy": 0.71767498254776,
"num_tokens": 446702383.0,
"step": 4850
},
{
"entropy": 1.112890625,
"epoch": 0.6642066420664207,
"grad_norm": 0.07763800599466689,
"learning_rate": 3.4433563477525715e-06,
"loss": 1.1221,
"mean_token_accuracy": 0.7307863712310791,
"num_tokens": 447616023.0,
"step": 4860
},
{
"entropy": 1.165625,
"epoch": 0.6655733223998906,
"grad_norm": 0.07416290673934596,
"learning_rate": 3.439833732563055e-06,
"loss": 1.1638,
"mean_token_accuracy": 0.7236368179321289,
"num_tokens": 448525850.0,
"step": 4870
},
{
"entropy": 1.15703125,
"epoch": 0.6669400027333606,
"grad_norm": 0.07198458749029335,
"learning_rate": 3.4363111173735386e-06,
"loss": 1.16,
"mean_token_accuracy": 0.7241340219974518,
"num_tokens": 449488499.0,
"step": 4880
},
{
"entropy": 1.16875,
"epoch": 0.6683066830668307,
"grad_norm": 0.07133597001283783,
"learning_rate": 3.4327885021840217e-06,
"loss": 1.1716,
"mean_token_accuracy": 0.7219540655612946,
"num_tokens": 450427887.0,
"step": 4890
},
{
"entropy": 1.1609375,
"epoch": 0.6696733634003007,
"grad_norm": 0.07578261162009403,
"learning_rate": 3.429265886994505e-06,
"loss": 1.1653,
"mean_token_accuracy": 0.72403564453125,
"num_tokens": 451388079.0,
"step": 4900
},
{
"entropy": 1.16875,
"epoch": 0.6710400437337707,
"grad_norm": 0.07264322968816501,
"learning_rate": 3.4257432718049884e-06,
"loss": 1.1644,
"mean_token_accuracy": 0.7225780189037323,
"num_tokens": 452325548.0,
"step": 4910
},
{
"entropy": 1.138671875,
"epoch": 0.6724067240672407,
"grad_norm": 0.07038760907482677,
"learning_rate": 3.4222206566154715e-06,
"loss": 1.1523,
"mean_token_accuracy": 0.7257314324378967,
"num_tokens": 453283517.0,
"step": 4920
},
{
"entropy": 1.1765625,
"epoch": 0.6737734044007107,
"grad_norm": 0.07029099062973922,
"learning_rate": 3.418698041425955e-06,
"loss": 1.1737,
"mean_token_accuracy": 0.7199459850788117,
"num_tokens": 454196973.0,
"step": 4930
},
{
"entropy": 1.126953125,
"epoch": 0.6751400847341806,
"grad_norm": 0.06973263362372017,
"learning_rate": 3.415175426236438e-06,
"loss": 1.1226,
"mean_token_accuracy": 0.7304257094860077,
"num_tokens": 455141874.0,
"step": 4940
},
{
"entropy": 1.1,
"epoch": 0.6765067650676507,
"grad_norm": 0.07102396103019286,
"learning_rate": 3.4116528110469217e-06,
"loss": 1.0967,
"mean_token_accuracy": 0.7366045773029327,
"num_tokens": 456058190.0,
"step": 4950
},
{
"entropy": 1.15625,
"epoch": 0.6778734454011207,
"grad_norm": 0.07722175127704248,
"learning_rate": 3.408130195857405e-06,
"loss": 1.1594,
"mean_token_accuracy": 0.724379163980484,
"num_tokens": 456928962.0,
"step": 4960
},
{
"entropy": 1.1890625,
"epoch": 0.6792401257345907,
"grad_norm": 0.07408648818575864,
"learning_rate": 3.404607580667888e-06,
"loss": 1.1845,
"mean_token_accuracy": 0.7197899937629699,
"num_tokens": 457856361.0,
"step": 4970
},
{
"entropy": 1.13359375,
"epoch": 0.6806068060680607,
"grad_norm": 0.07669975659261676,
"learning_rate": 3.401084965478371e-06,
"loss": 1.1343,
"mean_token_accuracy": 0.730760133266449,
"num_tokens": 458756840.0,
"step": 4980
},
{
"entropy": 1.17265625,
"epoch": 0.6819734864015307,
"grad_norm": 0.07704443685607931,
"learning_rate": 3.397562350288855e-06,
"loss": 1.1758,
"mean_token_accuracy": 0.7219862163066864,
"num_tokens": 459682905.0,
"step": 4990
},
{
"entropy": 1.13515625,
"epoch": 0.6833401667350006,
"grad_norm": 0.0724560731654229,
"learning_rate": 3.394039735099338e-06,
"loss": 1.136,
"mean_token_accuracy": 0.7286250591278076,
"num_tokens": 460604101.0,
"step": 5000
},
{
"entropy": 1.121875,
"epoch": 0.6847068470684707,
"grad_norm": 0.07559422977498603,
"learning_rate": 3.3905171199098213e-06,
"loss": 1.1308,
"mean_token_accuracy": 0.7281251728534699,
"num_tokens": 461512991.0,
"step": 5010
},
{
"entropy": 1.14296875,
"epoch": 0.6860735274019407,
"grad_norm": 0.07105451608231264,
"learning_rate": 3.3869945047203044e-06,
"loss": 1.15,
"mean_token_accuracy": 0.725556367635727,
"num_tokens": 462454053.0,
"step": 5020
},
{
"entropy": 1.1390625,
"epoch": 0.6874402077354107,
"grad_norm": 0.07778381395270549,
"learning_rate": 3.3834718895307876e-06,
"loss": 1.1309,
"mean_token_accuracy": 0.7288597702980042,
"num_tokens": 463360778.0,
"step": 5030
},
{
"entropy": 1.140625,
"epoch": 0.6888068880688807,
"grad_norm": 0.06848437033839444,
"learning_rate": 3.3799492743412715e-06,
"loss": 1.148,
"mean_token_accuracy": 0.7261955440044403,
"num_tokens": 464284769.0,
"step": 5040
},
{
"entropy": 1.1421875,
"epoch": 0.6901735684023507,
"grad_norm": 0.06858224781893939,
"learning_rate": 3.3764266591517547e-06,
"loss": 1.1425,
"mean_token_accuracy": 0.7274977803230286,
"num_tokens": 465220861.0,
"step": 5050
},
{
"entropy": 1.14375,
"epoch": 0.6915402487358207,
"grad_norm": 0.07064713590227573,
"learning_rate": 3.3729040439622378e-06,
"loss": 1.1374,
"mean_token_accuracy": 0.72767773270607,
"num_tokens": 466152597.0,
"step": 5060
},
{
"entropy": 1.16796875,
"epoch": 0.6929069290692907,
"grad_norm": 0.08212364147127212,
"learning_rate": 3.3693814287727213e-06,
"loss": 1.16,
"mean_token_accuracy": 0.7228836417198181,
"num_tokens": 467052088.0,
"step": 5070
},
{
"entropy": 1.196875,
"epoch": 0.6942736094027607,
"grad_norm": 0.07379480147597539,
"learning_rate": 3.3658588135832044e-06,
"loss": 1.2026,
"mean_token_accuracy": 0.7169423818588256,
"num_tokens": 468034958.0,
"step": 5080
},
{
"entropy": 1.13671875,
"epoch": 0.6956402897362307,
"grad_norm": 0.08366881943609672,
"learning_rate": 3.3623361983936876e-06,
"loss": 1.1398,
"mean_token_accuracy": 0.7288046181201935,
"num_tokens": 469021348.0,
"step": 5090
},
{
"entropy": 1.16875,
"epoch": 0.6970069700697007,
"grad_norm": 0.07465894684024824,
"learning_rate": 3.358813583204171e-06,
"loss": 1.1711,
"mean_token_accuracy": 0.7216868102550507,
"num_tokens": 469996550.0,
"step": 5100
},
{
"entropy": 1.221875,
"epoch": 0.6983736504031707,
"grad_norm": 0.07872768649203365,
"learning_rate": 3.3552909680146547e-06,
"loss": 1.2149,
"mean_token_accuracy": 0.7137858450412751,
"num_tokens": 470920985.0,
"step": 5110
},
{
"entropy": 1.13828125,
"epoch": 0.6997403307366407,
"grad_norm": 0.07504599342193499,
"learning_rate": 3.3517683528251378e-06,
"loss": 1.1381,
"mean_token_accuracy": 0.7276299059391022,
"num_tokens": 471845765.0,
"step": 5120
},
{
"entropy": 1.1359375,
"epoch": 0.7011070110701108,
"grad_norm": 0.08967516432609982,
"learning_rate": 3.348245737635621e-06,
"loss": 1.1294,
"mean_token_accuracy": 0.729719090461731,
"num_tokens": 472791415.0,
"step": 5130
},
{
"entropy": 1.14140625,
"epoch": 0.7024736914035807,
"grad_norm": 0.06795863223913523,
"learning_rate": 3.344723122446104e-06,
"loss": 1.1316,
"mean_token_accuracy": 0.7280956268310547,
"num_tokens": 473685911.0,
"step": 5140
},
{
"entropy": 1.18203125,
"epoch": 0.7038403717370507,
"grad_norm": 0.07366638903401379,
"learning_rate": 3.341200507256587e-06,
"loss": 1.183,
"mean_token_accuracy": 0.7210059463977814,
"num_tokens": 474598701.0,
"step": 5150
},
{
"entropy": 1.2046875,
"epoch": 0.7052070520705207,
"grad_norm": 0.07981000937283804,
"learning_rate": 3.337677892067071e-06,
"loss": 1.2033,
"mean_token_accuracy": 0.7175868272781372,
"num_tokens": 475505552.0,
"step": 5160
},
{
"entropy": 1.16953125,
"epoch": 0.7065737324039907,
"grad_norm": 0.07753514095210978,
"learning_rate": 3.3341552768775543e-06,
"loss": 1.1721,
"mean_token_accuracy": 0.7197802186012268,
"num_tokens": 476440433.0,
"step": 5170
},
{
"entropy": 1.154296875,
"epoch": 0.7079404127374607,
"grad_norm": 0.07248107159920317,
"learning_rate": 3.3306326616880374e-06,
"loss": 1.1643,
"mean_token_accuracy": 0.7232057213783264,
"num_tokens": 477364926.0,
"step": 5180
},
{
"entropy": 1.18359375,
"epoch": 0.7093070930709308,
"grad_norm": 0.06911149868387619,
"learning_rate": 3.3271100464985205e-06,
"loss": 1.174,
"mean_token_accuracy": 0.721264660358429,
"num_tokens": 478303729.0,
"step": 5190
},
{
"entropy": 1.153125,
"epoch": 0.7106737734044007,
"grad_norm": 0.07555384058901875,
"learning_rate": 3.323587431309004e-06,
"loss": 1.1521,
"mean_token_accuracy": 0.7270874559879303,
"num_tokens": 479265206.0,
"step": 5200
},
{
"entropy": 1.1734375,
"epoch": 0.7120404537378707,
"grad_norm": 0.0693269891142089,
"learning_rate": 3.3200648161194876e-06,
"loss": 1.1611,
"mean_token_accuracy": 0.7225268661975861,
"num_tokens": 480196185.0,
"step": 5210
},
{
"entropy": 1.1484375,
"epoch": 0.7134071340713407,
"grad_norm": 0.07699185736863115,
"learning_rate": 3.3165422009299707e-06,
"loss": 1.1529,
"mean_token_accuracy": 0.7248894155025483,
"num_tokens": 481119022.0,
"step": 5220
},
{
"entropy": 1.1890625,
"epoch": 0.7147738144048107,
"grad_norm": 0.07233861670957067,
"learning_rate": 3.313019585740454e-06,
"loss": 1.188,
"mean_token_accuracy": 0.7185642957687378,
"num_tokens": 482034528.0,
"step": 5230
},
{
"entropy": 1.17890625,
"epoch": 0.7161404947382807,
"grad_norm": 0.07591032976339392,
"learning_rate": 3.3094969705509374e-06,
"loss": 1.1819,
"mean_token_accuracy": 0.7194223403930664,
"num_tokens": 482963072.0,
"step": 5240
},
{
"entropy": 1.15546875,
"epoch": 0.7175071750717508,
"grad_norm": 0.07005302715313529,
"learning_rate": 3.3059743553614205e-06,
"loss": 1.1689,
"mean_token_accuracy": 0.7225077271461486,
"num_tokens": 483869466.0,
"step": 5250
},
{
"entropy": 1.1375,
"epoch": 0.7188738554052208,
"grad_norm": 0.07513835860271503,
"learning_rate": 3.3024517401719036e-06,
"loss": 1.1342,
"mean_token_accuracy": 0.7272518694400787,
"num_tokens": 484772219.0,
"step": 5260
},
{
"entropy": 1.16875,
"epoch": 0.7202405357386907,
"grad_norm": 0.08009362940151488,
"learning_rate": 3.298929124982387e-06,
"loss": 1.1869,
"mean_token_accuracy": 0.723358017206192,
"num_tokens": 485705049.0,
"step": 5270
},
{
"entropy": 1.2,
"epoch": 0.7216072160721607,
"grad_norm": 0.06978440593865189,
"learning_rate": 3.2954065097928707e-06,
"loss": 1.2105,
"mean_token_accuracy": 0.7162765324115753,
"num_tokens": 486647525.0,
"step": 5280
},
{
"entropy": 1.1515625,
"epoch": 0.7229738964056307,
"grad_norm": 0.07211135028134717,
"learning_rate": 3.291883894603354e-06,
"loss": 1.1559,
"mean_token_accuracy": 0.7259354710578918,
"num_tokens": 487571730.0,
"step": 5290
},
{
"entropy": 1.1109375,
"epoch": 0.7243405767391007,
"grad_norm": 0.0777178023379035,
"learning_rate": 3.288361279413837e-06,
"loss": 1.1059,
"mean_token_accuracy": 0.7320446014404297,
"num_tokens": 488486162.0,
"step": 5300
},
{
"entropy": 1.12890625,
"epoch": 0.7257072570725708,
"grad_norm": 0.06893017403680225,
"learning_rate": 3.28483866422432e-06,
"loss": 1.1254,
"mean_token_accuracy": 0.7306069910526276,
"num_tokens": 489397994.0,
"step": 5310
},
{
"entropy": 1.1671875,
"epoch": 0.7270739374060408,
"grad_norm": 0.0684759609530092,
"learning_rate": 3.281316049034804e-06,
"loss": 1.1731,
"mean_token_accuracy": 0.7204772233963013,
"num_tokens": 490378487.0,
"step": 5320
},
{
"entropy": 1.1375,
"epoch": 0.7284406177395107,
"grad_norm": 0.07113786055489268,
"learning_rate": 3.277793433845287e-06,
"loss": 1.1268,
"mean_token_accuracy": 0.7275528192520142,
"num_tokens": 491314657.0,
"step": 5330
},
{
"entropy": 1.1671875,
"epoch": 0.7298072980729807,
"grad_norm": 0.07507900249807559,
"learning_rate": 3.2742708186557703e-06,
"loss": 1.1732,
"mean_token_accuracy": 0.7235531210899353,
"num_tokens": 492253180.0,
"step": 5340
},
{
"entropy": 1.16875,
"epoch": 0.7311739784064507,
"grad_norm": 0.08008629620270061,
"learning_rate": 3.2707482034662534e-06,
"loss": 1.1638,
"mean_token_accuracy": 0.7215545952320099,
"num_tokens": 493185858.0,
"step": 5350
},
{
"entropy": 1.1671875,
"epoch": 0.7325406587399207,
"grad_norm": 0.07813135627478648,
"learning_rate": 3.2672255882767366e-06,
"loss": 1.1598,
"mean_token_accuracy": 0.726144152879715,
"num_tokens": 494110299.0,
"step": 5360
},
{
"entropy": 1.121875,
"epoch": 0.7339073390733908,
"grad_norm": 0.06548925281469259,
"learning_rate": 3.26370297308722e-06,
"loss": 1.1232,
"mean_token_accuracy": 0.7310143530368804,
"num_tokens": 495063936.0,
"step": 5370
},
{
"entropy": 1.15234375,
"epoch": 0.7352740194068608,
"grad_norm": 0.07211521685930017,
"learning_rate": 3.2601803578977037e-06,
"loss": 1.1531,
"mean_token_accuracy": 0.7239041090011596,
"num_tokens": 495982933.0,
"step": 5380
},
{
"entropy": 1.1375,
"epoch": 0.7366406997403308,
"grad_norm": 0.07500258548202877,
"learning_rate": 3.2566577427081868e-06,
"loss": 1.1426,
"mean_token_accuracy": 0.7272853493690491,
"num_tokens": 496933673.0,
"step": 5390
},
{
"entropy": 1.178125,
"epoch": 0.7380073800738007,
"grad_norm": 0.07581596193006869,
"learning_rate": 3.25313512751867e-06,
"loss": 1.1875,
"mean_token_accuracy": 0.720012241601944,
"num_tokens": 497833208.0,
"step": 5400
},
{
"entropy": 1.1859375,
"epoch": 0.7393740604072707,
"grad_norm": 0.08026576484183154,
"learning_rate": 3.2496125123291535e-06,
"loss": 1.1942,
"mean_token_accuracy": 0.7173934817314148,
"num_tokens": 498759050.0,
"step": 5410
},
{
"entropy": 1.14765625,
"epoch": 0.7407407407407407,
"grad_norm": 0.07559436102138549,
"learning_rate": 3.2460898971396366e-06,
"loss": 1.1425,
"mean_token_accuracy": 0.7287674665451049,
"num_tokens": 499683263.0,
"step": 5420
},
{
"entropy": 1.169140625,
"epoch": 0.7421074210742108,
"grad_norm": 0.07439910104022722,
"learning_rate": 3.24256728195012e-06,
"loss": 1.1741,
"mean_token_accuracy": 0.7208215832710266,
"num_tokens": 500587418.0,
"step": 5430
},
{
"entropy": 1.13828125,
"epoch": 0.7434741014076808,
"grad_norm": 0.0713498630816485,
"learning_rate": 3.2390446667606037e-06,
"loss": 1.1488,
"mean_token_accuracy": 0.7255042672157288,
"num_tokens": 501538593.0,
"step": 5440
},
{
"entropy": 1.1546875,
"epoch": 0.7448407817411508,
"grad_norm": 0.07155620532024758,
"learning_rate": 3.235522051571087e-06,
"loss": 1.1554,
"mean_token_accuracy": 0.7263371407985687,
"num_tokens": 502458567.0,
"step": 5450
},
{
"entropy": 1.16953125,
"epoch": 0.7462074620746207,
"grad_norm": 0.07208361728980775,
"learning_rate": 3.23199943638157e-06,
"loss": 1.1739,
"mean_token_accuracy": 0.7220803141593933,
"num_tokens": 503343942.0,
"step": 5460
},
{
"entropy": 1.1796875,
"epoch": 0.7475741424080907,
"grad_norm": 0.0804058432455138,
"learning_rate": 3.228476821192053e-06,
"loss": 1.1964,
"mean_token_accuracy": 0.7171915113925934,
"num_tokens": 504273657.0,
"step": 5470
},
{
"entropy": 1.19453125,
"epoch": 0.7489408227415607,
"grad_norm": 0.07368995786385579,
"learning_rate": 3.224954206002536e-06,
"loss": 1.2028,
"mean_token_accuracy": 0.716469818353653,
"num_tokens": 505223492.0,
"step": 5480
},
{
"entropy": 1.12890625,
"epoch": 0.7503075030750308,
"grad_norm": 0.0765876405006956,
"learning_rate": 3.22143159081302e-06,
"loss": 1.1275,
"mean_token_accuracy": 0.7304972350597382,
"num_tokens": 506149685.0,
"step": 5490
},
{
"entropy": 1.17734375,
"epoch": 0.7516741834085008,
"grad_norm": 0.07626235499393226,
"learning_rate": 3.2179089756235033e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.7209382593631745,
"num_tokens": 507071941.0,
"step": 5500
},
{
"entropy": 1.1875,
"epoch": 0.7530408637419708,
"grad_norm": 0.0733160471794245,
"learning_rate": 3.2143863604339864e-06,
"loss": 1.1822,
"mean_token_accuracy": 0.7203241944313049,
"num_tokens": 508013836.0,
"step": 5510
},
{
"entropy": 1.183984375,
"epoch": 0.7544075440754408,
"grad_norm": 0.07503815611435222,
"learning_rate": 3.2108637452444695e-06,
"loss": 1.1857,
"mean_token_accuracy": 0.7205870628356934,
"num_tokens": 508928276.0,
"step": 5520
},
{
"entropy": 1.209375,
"epoch": 0.7557742244089107,
"grad_norm": 0.07708839733378883,
"learning_rate": 3.2073411300549526e-06,
"loss": 1.21,
"mean_token_accuracy": 0.7155376076698303,
"num_tokens": 509829624.0,
"step": 5530
},
{
"entropy": 1.106640625,
"epoch": 0.7571409047423807,
"grad_norm": 0.07667435140505516,
"learning_rate": 3.2038185148654366e-06,
"loss": 1.1044,
"mean_token_accuracy": 0.7349287390708923,
"num_tokens": 510741617.0,
"step": 5540
},
{
"entropy": 1.1890625,
"epoch": 0.7585075850758508,
"grad_norm": 0.07094705382167878,
"learning_rate": 3.2002958996759197e-06,
"loss": 1.1962,
"mean_token_accuracy": 0.7183643937110901,
"num_tokens": 511633875.0,
"step": 5550
},
{
"entropy": 1.2078125,
"epoch": 0.7598742654093208,
"grad_norm": 0.07150009767171739,
"learning_rate": 3.196773284486403e-06,
"loss": 1.212,
"mean_token_accuracy": 0.7157334327697754,
"num_tokens": 512550694.0,
"step": 5560
},
{
"entropy": 1.17109375,
"epoch": 0.7612409457427908,
"grad_norm": 0.0710948198211244,
"learning_rate": 3.1932506692968864e-06,
"loss": 1.1669,
"mean_token_accuracy": 0.7218466162681579,
"num_tokens": 513493021.0,
"step": 5570
},
{
"entropy": 1.128125,
"epoch": 0.7626076260762608,
"grad_norm": 0.09663757308640611,
"learning_rate": 3.1897280541073695e-06,
"loss": 1.1263,
"mean_token_accuracy": 0.7303380191326141,
"num_tokens": 514438780.0,
"step": 5580
},
{
"entropy": 1.15625,
"epoch": 0.7639743064097307,
"grad_norm": 0.06745736764759237,
"learning_rate": 3.1862054389178526e-06,
"loss": 1.1605,
"mean_token_accuracy": 0.7252268552780151,
"num_tokens": 515416366.0,
"step": 5590
},
{
"entropy": 1.1359375,
"epoch": 0.7653409867432007,
"grad_norm": 0.07257393408850554,
"learning_rate": 3.182682823728336e-06,
"loss": 1.1367,
"mean_token_accuracy": 0.7261465311050415,
"num_tokens": 516348299.0,
"step": 5600
},
{
"entropy": 1.134375,
"epoch": 0.7667076670766708,
"grad_norm": 0.07418779060544994,
"learning_rate": 3.1791602085388197e-06,
"loss": 1.1379,
"mean_token_accuracy": 0.7282291650772095,
"num_tokens": 517284732.0,
"step": 5610
},
{
"entropy": 1.2109375,
"epoch": 0.7680743474101408,
"grad_norm": 0.07117956807864953,
"learning_rate": 3.175637593349303e-06,
"loss": 1.2218,
"mean_token_accuracy": 0.7133888244628906,
"num_tokens": 518184206.0,
"step": 5620
},
{
"entropy": 1.184375,
"epoch": 0.7694410277436108,
"grad_norm": 0.06749614089638341,
"learning_rate": 3.172114978159786e-06,
"loss": 1.1881,
"mean_token_accuracy": 0.7184696733951569,
"num_tokens": 519133164.0,
"step": 5630
},
{
"entropy": 1.16953125,
"epoch": 0.7708077080770808,
"grad_norm": 0.07050741152121154,
"learning_rate": 3.168592362970269e-06,
"loss": 1.1803,
"mean_token_accuracy": 0.7215631544589997,
"num_tokens": 520018070.0,
"step": 5640
},
{
"entropy": 1.13984375,
"epoch": 0.7721743884105507,
"grad_norm": 0.07190567964542417,
"learning_rate": 3.165069747780753e-06,
"loss": 1.1388,
"mean_token_accuracy": 0.7274946093559265,
"num_tokens": 520910258.0,
"step": 5650
},
{
"entropy": 1.17890625,
"epoch": 0.7735410687440207,
"grad_norm": 0.07159309785585187,
"learning_rate": 3.161547132591236e-06,
"loss": 1.1854,
"mean_token_accuracy": 0.7186541020870209,
"num_tokens": 521858095.0,
"step": 5660
},
{
"entropy": 1.16953125,
"epoch": 0.7749077490774908,
"grad_norm": 0.07653131268619921,
"learning_rate": 3.1580245174017193e-06,
"loss": 1.1844,
"mean_token_accuracy": 0.7191445827484131,
"num_tokens": 522796118.0,
"step": 5670
},
{
"entropy": 1.1625,
"epoch": 0.7762744294109608,
"grad_norm": 0.07451268959126872,
"learning_rate": 3.1545019022122025e-06,
"loss": 1.1578,
"mean_token_accuracy": 0.7239885807037354,
"num_tokens": 523717529.0,
"step": 5680
},
{
"entropy": 1.18046875,
"epoch": 0.7776411097444308,
"grad_norm": 0.07131009693774584,
"learning_rate": 3.1509792870226856e-06,
"loss": 1.1923,
"mean_token_accuracy": 0.7180477917194367,
"num_tokens": 524649560.0,
"step": 5690
},
{
"entropy": 1.140625,
"epoch": 0.7790077900779008,
"grad_norm": 0.07576220796898442,
"learning_rate": 3.147456671833169e-06,
"loss": 1.1397,
"mean_token_accuracy": 0.7282308995723724,
"num_tokens": 525561680.0,
"step": 5700
},
{
"entropy": 1.1625,
"epoch": 0.7803744704113708,
"grad_norm": 0.06975079914760615,
"learning_rate": 3.1439340566436527e-06,
"loss": 1.1544,
"mean_token_accuracy": 0.7249206185340882,
"num_tokens": 526487767.0,
"step": 5710
},
{
"entropy": 1.18359375,
"epoch": 0.7817411507448407,
"grad_norm": 0.07597526211337051,
"learning_rate": 3.140411441454136e-06,
"loss": 1.1858,
"mean_token_accuracy": 0.7193028450012207,
"num_tokens": 527395760.0,
"step": 5720
},
{
"entropy": 1.1546875,
"epoch": 0.7831078310783108,
"grad_norm": 0.07032534281246436,
"learning_rate": 3.136888826264619e-06,
"loss": 1.1579,
"mean_token_accuracy": 0.7229610681533813,
"num_tokens": 528315987.0,
"step": 5730
},
{
"entropy": 1.12890625,
"epoch": 0.7844745114117808,
"grad_norm": 0.10771826171367384,
"learning_rate": 3.1333662110751025e-06,
"loss": 1.1274,
"mean_token_accuracy": 0.7298998832702637,
"num_tokens": 529222893.0,
"step": 5740
},
{
"entropy": 1.1,
"epoch": 0.7858411917452508,
"grad_norm": 0.07883267151912558,
"learning_rate": 3.1298435958855856e-06,
"loss": 1.099,
"mean_token_accuracy": 0.7352051198482513,
"num_tokens": 530101850.0,
"step": 5750
},
{
"entropy": 1.1734375,
"epoch": 0.7872078720787208,
"grad_norm": 0.07292343450305311,
"learning_rate": 3.126320980696069e-06,
"loss": 1.1749,
"mean_token_accuracy": 0.722415417432785,
"num_tokens": 531000655.0,
"step": 5760
},
{
"entropy": 1.16875,
"epoch": 0.7885745524121908,
"grad_norm": 0.07924516267138795,
"learning_rate": 3.1227983655065523e-06,
"loss": 1.1653,
"mean_token_accuracy": 0.7225272417068481,
"num_tokens": 531903053.0,
"step": 5770
},
{
"entropy": 1.134375,
"epoch": 0.7899412327456607,
"grad_norm": 0.07279918639679707,
"learning_rate": 3.119275750317036e-06,
"loss": 1.1275,
"mean_token_accuracy": 0.7286905348300934,
"num_tokens": 532817171.0,
"step": 5780
},
{
"entropy": 1.190625,
"epoch": 0.7913079130791308,
"grad_norm": 0.07600615486665434,
"learning_rate": 3.115753135127519e-06,
"loss": 1.196,
"mean_token_accuracy": 0.7151669204235077,
"num_tokens": 533748500.0,
"step": 5790
},
{
"entropy": 1.1453125,
"epoch": 0.7926745934126008,
"grad_norm": 0.07742375809980616,
"learning_rate": 3.112230519938002e-06,
"loss": 1.1437,
"mean_token_accuracy": 0.7266421914100647,
"num_tokens": 534701836.0,
"step": 5800
},
{
"entropy": 1.13515625,
"epoch": 0.7940412737460708,
"grad_norm": 0.07857596098037942,
"learning_rate": 3.108707904748485e-06,
"loss": 1.1353,
"mean_token_accuracy": 0.7291074693202972,
"num_tokens": 535560186.0,
"step": 5810
},
{
"entropy": 1.16796875,
"epoch": 0.7954079540795408,
"grad_norm": 0.06619943168955243,
"learning_rate": 3.105185289558969e-06,
"loss": 1.1765,
"mean_token_accuracy": 0.7211933553218841,
"num_tokens": 536522995.0,
"step": 5820
},
{
"entropy": 1.14296875,
"epoch": 0.7967746344130108,
"grad_norm": 0.06681479984007155,
"learning_rate": 3.1016626743694523e-06,
"loss": 1.1419,
"mean_token_accuracy": 0.7254096746444703,
"num_tokens": 537446008.0,
"step": 5830
},
{
"entropy": 1.09765625,
"epoch": 0.7981413147464808,
"grad_norm": 0.0703119236480056,
"learning_rate": 3.0981400591799354e-06,
"loss": 1.0977,
"mean_token_accuracy": 0.7351486265659333,
"num_tokens": 538421022.0,
"step": 5840
},
{
"entropy": 1.14140625,
"epoch": 0.7995079950799509,
"grad_norm": 0.06742784162061534,
"learning_rate": 3.0946174439904185e-06,
"loss": 1.1397,
"mean_token_accuracy": 0.7291741251945496,
"num_tokens": 539358105.0,
"step": 5850
},
{
"entropy": 1.15703125,
"epoch": 0.8008746754134208,
"grad_norm": 0.07535967605369452,
"learning_rate": 3.0910948288009016e-06,
"loss": 1.1542,
"mean_token_accuracy": 0.7256515920162201,
"num_tokens": 540247054.0,
"step": 5860
},
{
"entropy": 1.13046875,
"epoch": 0.8022413557468908,
"grad_norm": 0.07728433823916704,
"learning_rate": 3.0875722136113856e-06,
"loss": 1.1353,
"mean_token_accuracy": 0.7275100767612457,
"num_tokens": 541163430.0,
"step": 5870
},
{
"entropy": 1.18359375,
"epoch": 0.8036080360803608,
"grad_norm": 0.0758892853918405,
"learning_rate": 3.0840495984218687e-06,
"loss": 1.1811,
"mean_token_accuracy": 0.7210020005702973,
"num_tokens": 542107927.0,
"step": 5880
},
{
"entropy": 1.1671875,
"epoch": 0.8049747164138308,
"grad_norm": 0.07939640702054968,
"learning_rate": 3.080526983232352e-06,
"loss": 1.1684,
"mean_token_accuracy": 0.7217608213424682,
"num_tokens": 543019413.0,
"step": 5890
},
{
"entropy": 1.17578125,
"epoch": 0.8063413967473008,
"grad_norm": 0.06858663662518262,
"learning_rate": 3.0770043680428354e-06,
"loss": 1.1738,
"mean_token_accuracy": 0.7212044715881347,
"num_tokens": 543947382.0,
"step": 5900
},
{
"entropy": 1.1421875,
"epoch": 0.8077080770807709,
"grad_norm": 0.07766636135758749,
"learning_rate": 3.0734817528533185e-06,
"loss": 1.1412,
"mean_token_accuracy": 0.7284856259822845,
"num_tokens": 544847561.0,
"step": 5910
},
{
"entropy": 1.19375,
"epoch": 0.8090747574142408,
"grad_norm": 0.07869539161508957,
"learning_rate": 3.0699591376638017e-06,
"loss": 1.201,
"mean_token_accuracy": 0.7165490090847015,
"num_tokens": 545757015.0,
"step": 5920
},
{
"entropy": 1.14140625,
"epoch": 0.8104414377477108,
"grad_norm": 0.07567622282066397,
"learning_rate": 3.066436522474285e-06,
"loss": 1.1315,
"mean_token_accuracy": 0.7311730861663819,
"num_tokens": 546673248.0,
"step": 5930
},
{
"entropy": 1.098828125,
"epoch": 0.8118081180811808,
"grad_norm": 0.06606729275194072,
"learning_rate": 3.0629139072847688e-06,
"loss": 1.1026,
"mean_token_accuracy": 0.7339667439460754,
"num_tokens": 547651025.0,
"step": 5940
},
{
"entropy": 1.19375,
"epoch": 0.8131747984146508,
"grad_norm": 0.07694967284462523,
"learning_rate": 3.059391292095252e-06,
"loss": 1.1971,
"mean_token_accuracy": 0.7170174837112426,
"num_tokens": 548553494.0,
"step": 5950
},
{
"entropy": 1.17578125,
"epoch": 0.8145414787481208,
"grad_norm": 0.07349659990835351,
"learning_rate": 3.055868676905735e-06,
"loss": 1.1808,
"mean_token_accuracy": 0.7214943706989289,
"num_tokens": 549523321.0,
"step": 5960
},
{
"entropy": 1.19609375,
"epoch": 0.8159081590815909,
"grad_norm": 0.08299466648334716,
"learning_rate": 3.052346061716218e-06,
"loss": 1.2015,
"mean_token_accuracy": 0.7172509908676148,
"num_tokens": 550382497.0,
"step": 5970
},
{
"entropy": 1.15078125,
"epoch": 0.8172748394150608,
"grad_norm": 0.0767952839872855,
"learning_rate": 3.048823446526702e-06,
"loss": 1.1566,
"mean_token_accuracy": 0.7244634866714478,
"num_tokens": 551315192.0,
"step": 5980
},
{
"entropy": 1.1734375,
"epoch": 0.8186415197485308,
"grad_norm": 0.07333878173178052,
"learning_rate": 3.0453008313371852e-06,
"loss": 1.1761,
"mean_token_accuracy": 0.7217873811721802,
"num_tokens": 552283477.0,
"step": 5990
},
{
"entropy": 1.14921875,
"epoch": 0.8200082000820008,
"grad_norm": 0.07485048673356857,
"learning_rate": 3.0417782161476683e-06,
"loss": 1.1576,
"mean_token_accuracy": 0.7239622354507447,
"num_tokens": 553194884.0,
"step": 6000
},
{
"entropy": 1.20625,
"epoch": 0.8213748804154708,
"grad_norm": 0.07719431531068228,
"learning_rate": 3.0382556009581515e-06,
"loss": 1.2195,
"mean_token_accuracy": 0.7135745644569397,
"num_tokens": 554107536.0,
"step": 6010
},
{
"entropy": 1.1453125,
"epoch": 0.8227415607489408,
"grad_norm": 0.0720062155707693,
"learning_rate": 3.0347329857686346e-06,
"loss": 1.1274,
"mean_token_accuracy": 0.7282998621463775,
"num_tokens": 554994790.0,
"step": 6020
},
{
"entropy": 1.134375,
"epoch": 0.8241082410824109,
"grad_norm": 0.07881673602124045,
"learning_rate": 3.031210370579118e-06,
"loss": 1.1437,
"mean_token_accuracy": 0.7291691124439239,
"num_tokens": 555917889.0,
"step": 6030
},
{
"entropy": 1.11875,
"epoch": 0.8254749214158809,
"grad_norm": 0.08346569933312713,
"learning_rate": 3.0276877553896017e-06,
"loss": 1.1113,
"mean_token_accuracy": 0.7311753988265991,
"num_tokens": 556842526.0,
"step": 6040
},
{
"entropy": 1.16484375,
"epoch": 0.8268416017493508,
"grad_norm": 0.07738394330575885,
"learning_rate": 3.024165140200085e-06,
"loss": 1.1669,
"mean_token_accuracy": 0.7228632867336273,
"num_tokens": 557755256.0,
"step": 6050
},
{
"entropy": 1.1734375,
"epoch": 0.8282082820828208,
"grad_norm": 0.08261811464392206,
"learning_rate": 3.020642525010568e-06,
"loss": 1.1801,
"mean_token_accuracy": 0.721024078130722,
"num_tokens": 558660408.0,
"step": 6060
},
{
"entropy": 1.15625,
"epoch": 0.8295749624162908,
"grad_norm": 0.07560773725191987,
"learning_rate": 3.0171199098210515e-06,
"loss": 1.1502,
"mean_token_accuracy": 0.7261201798915863,
"num_tokens": 559553626.0,
"step": 6070
},
{
"entropy": 1.15625,
"epoch": 0.8309416427497608,
"grad_norm": 0.07338628908653562,
"learning_rate": 3.0135972946315346e-06,
"loss": 1.163,
"mean_token_accuracy": 0.7244411110877991,
"num_tokens": 560480649.0,
"step": 6080
},
{
"entropy": 1.16875,
"epoch": 0.8323083230832309,
"grad_norm": 0.0702562707652083,
"learning_rate": 3.010074679442018e-06,
"loss": 1.1641,
"mean_token_accuracy": 0.7231172621250153,
"num_tokens": 561376221.0,
"step": 6090
},
{
"entropy": 1.18046875,
"epoch": 0.8336750034167009,
"grad_norm": 0.07592461091451838,
"learning_rate": 3.0065520642525013e-06,
"loss": 1.1848,
"mean_token_accuracy": 0.7205139338970185,
"num_tokens": 562274400.0,
"step": 6100
},
{
"entropy": 1.18046875,
"epoch": 0.8350416837501708,
"grad_norm": 0.07729128439614716,
"learning_rate": 3.003029449062985e-06,
"loss": 1.173,
"mean_token_accuracy": 0.7223123848438263,
"num_tokens": 563215626.0,
"step": 6110
},
{
"entropy": 1.1578125,
"epoch": 0.8364083640836408,
"grad_norm": 0.07683984040342311,
"learning_rate": 2.999506833873468e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.7242967545986175,
"num_tokens": 564091313.0,
"step": 6120
},
{
"entropy": 1.1859375,
"epoch": 0.8377750444171108,
"grad_norm": 0.06884069075393245,
"learning_rate": 2.995984218683951e-06,
"loss": 1.1774,
"mean_token_accuracy": 0.7205722153186798,
"num_tokens": 565012295.0,
"step": 6130
},
{
"entropy": 1.137890625,
"epoch": 0.8391417247505808,
"grad_norm": 0.077342509454852,
"learning_rate": 2.992461603494434e-06,
"loss": 1.13,
"mean_token_accuracy": 0.7276026904582977,
"num_tokens": 565954544.0,
"step": 6140
},
{
"entropy": 1.1484375,
"epoch": 0.8405084050840509,
"grad_norm": 0.10086786627453855,
"learning_rate": 2.988938988304918e-06,
"loss": 1.1467,
"mean_token_accuracy": 0.7256333887577057,
"num_tokens": 566894690.0,
"step": 6150
},
{
"entropy": 1.136328125,
"epoch": 0.8418750854175209,
"grad_norm": 0.06886963598355866,
"learning_rate": 2.9854163731154013e-06,
"loss": 1.1396,
"mean_token_accuracy": 0.7297028183937073,
"num_tokens": 567803082.0,
"step": 6160
},
{
"entropy": 1.17109375,
"epoch": 0.8432417657509909,
"grad_norm": 0.0718620468168916,
"learning_rate": 2.9818937579258844e-06,
"loss": 1.1787,
"mean_token_accuracy": 0.7203066885471344,
"num_tokens": 568676822.0,
"step": 6170
},
{
"entropy": 1.14296875,
"epoch": 0.8446084460844608,
"grad_norm": 0.07617988050738174,
"learning_rate": 2.9783711427363675e-06,
"loss": 1.145,
"mean_token_accuracy": 0.7258231997489929,
"num_tokens": 569602337.0,
"step": 6180
},
{
"entropy": 1.17109375,
"epoch": 0.8459751264179308,
"grad_norm": 0.07786581622504642,
"learning_rate": 2.9748485275468507e-06,
"loss": 1.1698,
"mean_token_accuracy": 0.7239695847034454,
"num_tokens": 570513919.0,
"step": 6190
},
{
"entropy": 1.15078125,
"epoch": 0.8473418067514008,
"grad_norm": 0.07342249360709947,
"learning_rate": 2.9713259123573346e-06,
"loss": 1.1649,
"mean_token_accuracy": 0.7239412426948547,
"num_tokens": 571394215.0,
"step": 6200
},
{
"entropy": 1.15078125,
"epoch": 0.8487084870848709,
"grad_norm": 0.07271934476407611,
"learning_rate": 2.9678032971678177e-06,
"loss": 1.1564,
"mean_token_accuracy": 0.7230813086032868,
"num_tokens": 572265499.0,
"step": 6210
},
{
"entropy": 1.15625,
"epoch": 0.8500751674183409,
"grad_norm": 0.07272395582528682,
"learning_rate": 2.964280681978301e-06,
"loss": 1.1626,
"mean_token_accuracy": 0.7212814569473267,
"num_tokens": 573215815.0,
"step": 6220
},
{
"entropy": 1.121875,
"epoch": 0.8514418477518109,
"grad_norm": 0.07871713787463883,
"learning_rate": 2.960758066788784e-06,
"loss": 1.1167,
"mean_token_accuracy": 0.7309862792491912,
"num_tokens": 574095844.0,
"step": 6230
},
{
"entropy": 1.18203125,
"epoch": 0.8528085280852808,
"grad_norm": 0.07640271885196863,
"learning_rate": 2.9572354515992675e-06,
"loss": 1.174,
"mean_token_accuracy": 0.7199901640415192,
"num_tokens": 574990843.0,
"step": 6240
},
{
"entropy": 1.09921875,
"epoch": 0.8541752084187508,
"grad_norm": 0.08102711909145398,
"learning_rate": 2.9537128364097507e-06,
"loss": 1.0913,
"mean_token_accuracy": 0.7384831070899963,
"num_tokens": 575911166.0,
"step": 6250
},
{
"entropy": 1.15625,
"epoch": 0.8555418887522208,
"grad_norm": 0.06587881907933123,
"learning_rate": 2.9501902212202342e-06,
"loss": 1.1535,
"mean_token_accuracy": 0.725889015197754,
"num_tokens": 576865158.0,
"step": 6260
},
{
"entropy": 1.1640625,
"epoch": 0.8569085690856909,
"grad_norm": 0.07316512695472475,
"learning_rate": 2.9466676060307178e-06,
"loss": 1.1705,
"mean_token_accuracy": 0.72081059217453,
"num_tokens": 577798743.0,
"step": 6270
},
{
"entropy": 1.17578125,
"epoch": 0.8582752494191609,
"grad_norm": 0.0729407657251821,
"learning_rate": 2.943144990841201e-06,
"loss": 1.1812,
"mean_token_accuracy": 0.7207557499408722,
"num_tokens": 578736942.0,
"step": 6280
},
{
"entropy": 1.16484375,
"epoch": 0.8596419297526309,
"grad_norm": 0.07298591793078872,
"learning_rate": 2.939622375651684e-06,
"loss": 1.1569,
"mean_token_accuracy": 0.7253593623638153,
"num_tokens": 579657758.0,
"step": 6290
},
{
"entropy": 1.16796875,
"epoch": 0.8610086100861009,
"grad_norm": 0.0811839759620073,
"learning_rate": 2.936099760462167e-06,
"loss": 1.1689,
"mean_token_accuracy": 0.7227496087551117,
"num_tokens": 580624116.0,
"step": 6300
},
{
"entropy": 1.14140625,
"epoch": 0.8623752904195708,
"grad_norm": 0.0776888328357935,
"learning_rate": 2.932577145272651e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.7276157081127167,
"num_tokens": 581517703.0,
"step": 6310
},
{
"entropy": 1.175390625,
"epoch": 0.8637419707530408,
"grad_norm": 0.06884941607901747,
"learning_rate": 2.9290545300831342e-06,
"loss": 1.177,
"mean_token_accuracy": 0.7206972956657409,
"num_tokens": 582458745.0,
"step": 6320
},
{
"entropy": 1.190625,
"epoch": 0.8651086510865109,
"grad_norm": 0.12460829910922243,
"learning_rate": 2.9255319148936174e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.7209498167037964,
"num_tokens": 583377378.0,
"step": 6330
},
{
"entropy": 1.1109375,
"epoch": 0.8664753314199809,
"grad_norm": 0.07285485878858074,
"learning_rate": 2.9220092997041005e-06,
"loss": 1.105,
"mean_token_accuracy": 0.7363126754760743,
"num_tokens": 584312704.0,
"step": 6340
},
{
"entropy": 1.15859375,
"epoch": 0.8678420117534509,
"grad_norm": 0.070515525857349,
"learning_rate": 2.9184866845145836e-06,
"loss": 1.1507,
"mean_token_accuracy": 0.7241629064083099,
"num_tokens": 585234665.0,
"step": 6350
},
{
"entropy": 1.142578125,
"epoch": 0.8692086920869209,
"grad_norm": 0.07829146628187363,
"learning_rate": 2.9149640693250667e-06,
"loss": 1.1374,
"mean_token_accuracy": 0.7269978404045105,
"num_tokens": 586185228.0,
"step": 6360
},
{
"entropy": 1.14140625,
"epoch": 0.8705753724203908,
"grad_norm": 0.07732755179568113,
"learning_rate": 2.9114414541355507e-06,
"loss": 1.1416,
"mean_token_accuracy": 0.7261748433113098,
"num_tokens": 587089018.0,
"step": 6370
},
{
"entropy": 1.17421875,
"epoch": 0.8719420527538608,
"grad_norm": 0.0695673673393698,
"learning_rate": 2.907918838946034e-06,
"loss": 1.172,
"mean_token_accuracy": 0.7232531905174255,
"num_tokens": 588017673.0,
"step": 6380
},
{
"entropy": 1.14921875,
"epoch": 0.8733087330873309,
"grad_norm": 0.0812768270329388,
"learning_rate": 2.904396223756517e-06,
"loss": 1.1473,
"mean_token_accuracy": 0.7274377763271331,
"num_tokens": 588935725.0,
"step": 6390
},
{
"entropy": 1.16171875,
"epoch": 0.8746754134208009,
"grad_norm": 0.07633502857947162,
"learning_rate": 2.9008736085670005e-06,
"loss": 1.1612,
"mean_token_accuracy": 0.7230794906616211,
"num_tokens": 589851938.0,
"step": 6400
},
{
"entropy": 1.109765625,
"epoch": 0.8760420937542709,
"grad_norm": 0.07960933343282967,
"learning_rate": 2.8973509933774836e-06,
"loss": 1.0994,
"mean_token_accuracy": 0.7346998393535614,
"num_tokens": 590750641.0,
"step": 6410
},
{
"entropy": 1.1671875,
"epoch": 0.8774087740877409,
"grad_norm": 0.07061586029617532,
"learning_rate": 2.893828378187967e-06,
"loss": 1.1785,
"mean_token_accuracy": 0.7210073113441468,
"num_tokens": 591691593.0,
"step": 6420
},
{
"entropy": 1.15,
"epoch": 0.8787754544212109,
"grad_norm": 0.07825092753032652,
"learning_rate": 2.8903057629984503e-06,
"loss": 1.1511,
"mean_token_accuracy": 0.7250349044799804,
"num_tokens": 592586272.0,
"step": 6430
},
{
"entropy": 1.14296875,
"epoch": 0.8801421347546808,
"grad_norm": 0.07552448334955235,
"learning_rate": 2.886783147808934e-06,
"loss": 1.1462,
"mean_token_accuracy": 0.727625173330307,
"num_tokens": 593523306.0,
"step": 6440
},
{
"entropy": 1.203125,
"epoch": 0.8815088150881509,
"grad_norm": 0.07156558480861205,
"learning_rate": 2.883260532619417e-06,
"loss": 1.2037,
"mean_token_accuracy": 0.7152089774608612,
"num_tokens": 594414758.0,
"step": 6450
},
{
"entropy": 1.15390625,
"epoch": 0.8828754954216209,
"grad_norm": 0.08239122406326786,
"learning_rate": 2.8797379174299e-06,
"loss": 1.1403,
"mean_token_accuracy": 0.7277523875236511,
"num_tokens": 595285971.0,
"step": 6460
},
{
"entropy": 1.10234375,
"epoch": 0.8842421757550909,
"grad_norm": 0.07283102123469033,
"learning_rate": 2.876215302240383e-06,
"loss": 1.0952,
"mean_token_accuracy": 0.7385133683681488,
"num_tokens": 596196999.0,
"step": 6470
},
{
"entropy": 1.175,
"epoch": 0.8856088560885609,
"grad_norm": 0.0833426443925609,
"learning_rate": 2.872692687050867e-06,
"loss": 1.1633,
"mean_token_accuracy": 0.7231970369815827,
"num_tokens": 597148890.0,
"step": 6480
},
{
"entropy": 1.1375,
"epoch": 0.8869755364220309,
"grad_norm": 0.06894927543344626,
"learning_rate": 2.8691700718613503e-06,
"loss": 1.1366,
"mean_token_accuracy": 0.7269395172595978,
"num_tokens": 598063968.0,
"step": 6490
},
{
"entropy": 1.1421875,
"epoch": 0.8883422167555008,
"grad_norm": 0.07397925635822189,
"learning_rate": 2.8656474566718334e-06,
"loss": 1.1445,
"mean_token_accuracy": 0.7262307107448578,
"num_tokens": 599016906.0,
"step": 6500
},
{
"entropy": 1.12421875,
"epoch": 0.8897088970889709,
"grad_norm": 0.06927397752944968,
"learning_rate": 2.8621248414823165e-06,
"loss": 1.1193,
"mean_token_accuracy": 0.7299073219299317,
"num_tokens": 599932323.0,
"step": 6510
},
{
"entropy": 1.15390625,
"epoch": 0.8910755774224409,
"grad_norm": 0.08150988524399752,
"learning_rate": 2.8586022262927997e-06,
"loss": 1.1426,
"mean_token_accuracy": 0.7262343883514404,
"num_tokens": 600787290.0,
"step": 6520
},
{
"entropy": 1.209375,
"epoch": 0.8924422577559109,
"grad_norm": 0.09532850407177261,
"learning_rate": 2.8550796111032836e-06,
"loss": 1.2064,
"mean_token_accuracy": 0.7146926581859588,
"num_tokens": 601718435.0,
"step": 6530
},
{
"entropy": 1.171875,
"epoch": 0.8938089380893809,
"grad_norm": 0.08009150526697785,
"learning_rate": 2.8515569959137668e-06,
"loss": 1.1826,
"mean_token_accuracy": 0.7195375502109528,
"num_tokens": 602629379.0,
"step": 6540
},
{
"entropy": 1.13828125,
"epoch": 0.8951756184228509,
"grad_norm": 0.07217378695583132,
"learning_rate": 2.84803438072425e-06,
"loss": 1.144,
"mean_token_accuracy": 0.7284374356269836,
"num_tokens": 603541464.0,
"step": 6550
},
{
"entropy": 1.13828125,
"epoch": 0.8965422987563209,
"grad_norm": 0.0725480970685191,
"learning_rate": 2.844511765534733e-06,
"loss": 1.1439,
"mean_token_accuracy": 0.7268552243709564,
"num_tokens": 604455855.0,
"step": 6560
},
{
"entropy": 1.10078125,
"epoch": 0.897908979089791,
"grad_norm": 0.06422979699653435,
"learning_rate": 2.8409891503452166e-06,
"loss": 1.0923,
"mean_token_accuracy": 0.7365232169628143,
"num_tokens": 605395006.0,
"step": 6570
},
{
"entropy": 1.16484375,
"epoch": 0.8992756594232609,
"grad_norm": 0.07449715970366734,
"learning_rate": 2.8374665351556997e-06,
"loss": 1.1702,
"mean_token_accuracy": 0.721227490901947,
"num_tokens": 606286202.0,
"step": 6580
},
{
"entropy": 1.16171875,
"epoch": 0.9006423397567309,
"grad_norm": 0.07272483947306195,
"learning_rate": 2.8339439199661832e-06,
"loss": 1.1621,
"mean_token_accuracy": 0.7247979402542114,
"num_tokens": 607248617.0,
"step": 6590
},
{
"entropy": 1.169140625,
"epoch": 0.9020090200902009,
"grad_norm": 0.0737655216367567,
"learning_rate": 2.8304213047766663e-06,
"loss": 1.1794,
"mean_token_accuracy": 0.7198045134544373,
"num_tokens": 608137958.0,
"step": 6600
},
{
"entropy": 1.11171875,
"epoch": 0.9033757004236709,
"grad_norm": 0.0716355089354242,
"learning_rate": 2.82689868958715e-06,
"loss": 1.1114,
"mean_token_accuracy": 0.7329358577728271,
"num_tokens": 609065549.0,
"step": 6610
},
{
"entropy": 1.121484375,
"epoch": 0.9047423807571409,
"grad_norm": 0.07543679131211123,
"learning_rate": 2.823376074397633e-06,
"loss": 1.1178,
"mean_token_accuracy": 0.7303046405315399,
"num_tokens": 609996904.0,
"step": 6620
},
{
"entropy": 1.1453125,
"epoch": 0.906109061090611,
"grad_norm": 0.07522460960640055,
"learning_rate": 2.819853459208116e-06,
"loss": 1.1371,
"mean_token_accuracy": 0.728844267129898,
"num_tokens": 610881045.0,
"step": 6630
},
{
"entropy": 1.19453125,
"epoch": 0.9074757414240809,
"grad_norm": 0.08273212987479597,
"learning_rate": 2.8163308440186e-06,
"loss": 1.2011,
"mean_token_accuracy": 0.7167985022068024,
"num_tokens": 611806746.0,
"step": 6640
},
{
"entropy": 1.16796875,
"epoch": 0.9088424217575509,
"grad_norm": 0.07630454437787305,
"learning_rate": 2.8128082288290832e-06,
"loss": 1.1655,
"mean_token_accuracy": 0.7230241358280182,
"num_tokens": 612709434.0,
"step": 6650
},
{
"entropy": 1.159375,
"epoch": 0.9102091020910209,
"grad_norm": 0.07699932183213455,
"learning_rate": 2.8092856136395664e-06,
"loss": 1.1503,
"mean_token_accuracy": 0.7254749894142151,
"num_tokens": 613639390.0,
"step": 6660
},
{
"entropy": 1.15390625,
"epoch": 0.9115757824244909,
"grad_norm": 0.07577966442811876,
"learning_rate": 2.8057629984500495e-06,
"loss": 1.1554,
"mean_token_accuracy": 0.7228997766971588,
"num_tokens": 614605380.0,
"step": 6670
},
{
"entropy": 1.142578125,
"epoch": 0.9129424627579609,
"grad_norm": 0.07166293623146275,
"learning_rate": 2.8022403832605326e-06,
"loss": 1.1306,
"mean_token_accuracy": 0.7293377697467804,
"num_tokens": 615539760.0,
"step": 6680
},
{
"entropy": 1.1625,
"epoch": 0.914309143091431,
"grad_norm": 0.07698639962011997,
"learning_rate": 2.7987177680710157e-06,
"loss": 1.1668,
"mean_token_accuracy": 0.724218875169754,
"num_tokens": 616477576.0,
"step": 6690
},
{
"entropy": 1.15,
"epoch": 0.915675823424901,
"grad_norm": 0.07917299136578755,
"learning_rate": 2.7951951528814997e-06,
"loss": 1.1576,
"mean_token_accuracy": 0.7231554329395294,
"num_tokens": 617421890.0,
"step": 6700
},
{
"entropy": 1.162109375,
"epoch": 0.9170425037583709,
"grad_norm": 0.07094898308298138,
"learning_rate": 2.791672537691983e-06,
"loss": 1.1803,
"mean_token_accuracy": 0.7191467821598053,
"num_tokens": 618393590.0,
"step": 6710
},
{
"entropy": 1.159375,
"epoch": 0.9184091840918409,
"grad_norm": 0.07952976358986978,
"learning_rate": 2.788149922502466e-06,
"loss": 1.1583,
"mean_token_accuracy": 0.7257636487483978,
"num_tokens": 619296207.0,
"step": 6720
},
{
"entropy": 1.146484375,
"epoch": 0.9197758644253109,
"grad_norm": 0.1056166166441296,
"learning_rate": 2.784627307312949e-06,
"loss": 1.1554,
"mean_token_accuracy": 0.725968861579895,
"num_tokens": 620235237.0,
"step": 6730
},
{
"entropy": 1.12421875,
"epoch": 0.9211425447587809,
"grad_norm": 0.0691089051597861,
"learning_rate": 2.7811046921234326e-06,
"loss": 1.113,
"mean_token_accuracy": 0.7323518693447113,
"num_tokens": 621157947.0,
"step": 6740
},
{
"entropy": 1.1859375,
"epoch": 0.922509225092251,
"grad_norm": 0.0720567406454081,
"learning_rate": 2.777582076933916e-06,
"loss": 1.1828,
"mean_token_accuracy": 0.7206071972846985,
"num_tokens": 622061617.0,
"step": 6750
},
{
"entropy": 1.1625,
"epoch": 0.923875905425721,
"grad_norm": 0.06451797424654195,
"learning_rate": 2.7740594617443993e-06,
"loss": 1.1577,
"mean_token_accuracy": 0.723643684387207,
"num_tokens": 623006404.0,
"step": 6760
},
{
"entropy": 1.14375,
"epoch": 0.9252425857591909,
"grad_norm": 0.07190608984775515,
"learning_rate": 2.770536846554883e-06,
"loss": 1.1493,
"mean_token_accuracy": 0.7269912242889405,
"num_tokens": 623971191.0,
"step": 6770
},
{
"entropy": 1.175390625,
"epoch": 0.9266092660926609,
"grad_norm": 0.07357543153049992,
"learning_rate": 2.767014231365366e-06,
"loss": 1.1875,
"mean_token_accuracy": 0.7198644340038299,
"num_tokens": 624883123.0,
"step": 6780
},
{
"entropy": 1.1609375,
"epoch": 0.9279759464261309,
"grad_norm": 0.07014050234508871,
"learning_rate": 2.763491616175849e-06,
"loss": 1.1594,
"mean_token_accuracy": 0.7241424441337585,
"num_tokens": 625850233.0,
"step": 6790
},
{
"entropy": 1.12734375,
"epoch": 0.9293426267596009,
"grad_norm": 0.08081917588977887,
"learning_rate": 2.759969000986332e-06,
"loss": 1.1222,
"mean_token_accuracy": 0.730657833814621,
"num_tokens": 626789700.0,
"step": 6800
},
{
"entropy": 1.10078125,
"epoch": 0.930709307093071,
"grad_norm": 0.07130369033230517,
"learning_rate": 2.756446385796816e-06,
"loss": 1.0996,
"mean_token_accuracy": 0.7349979639053345,
"num_tokens": 627697328.0,
"step": 6810
},
{
"entropy": 1.134375,
"epoch": 0.932075987426541,
"grad_norm": 0.06749430798617702,
"learning_rate": 2.7529237706072993e-06,
"loss": 1.1298,
"mean_token_accuracy": 0.7293443500995636,
"num_tokens": 628630914.0,
"step": 6820
},
{
"entropy": 1.13359375,
"epoch": 0.933442667760011,
"grad_norm": 0.07343397717281465,
"learning_rate": 2.7494011554177824e-06,
"loss": 1.1289,
"mean_token_accuracy": 0.73056401014328,
"num_tokens": 629547186.0,
"step": 6830
},
{
"entropy": 1.1921875,
"epoch": 0.9348093480934809,
"grad_norm": 0.07625501264876987,
"learning_rate": 2.7458785402282656e-06,
"loss": 1.1915,
"mean_token_accuracy": 0.7186883151531219,
"num_tokens": 630457312.0,
"step": 6840
},
{
"entropy": 1.196875,
"epoch": 0.9361760284269509,
"grad_norm": 0.07592728686199672,
"learning_rate": 2.7423559250387487e-06,
"loss": 1.1939,
"mean_token_accuracy": 0.7179849803447723,
"num_tokens": 631376333.0,
"step": 6850
},
{
"entropy": 1.15546875,
"epoch": 0.9375427087604209,
"grad_norm": 0.07165124766701778,
"learning_rate": 2.7388333098492326e-06,
"loss": 1.1368,
"mean_token_accuracy": 0.728828901052475,
"num_tokens": 632240772.0,
"step": 6860
},
{
"entropy": 1.16875,
"epoch": 0.938909389093891,
"grad_norm": 0.10596453583296306,
"learning_rate": 2.7353106946597158e-06,
"loss": 1.166,
"mean_token_accuracy": 0.7230931997299195,
"num_tokens": 633212586.0,
"step": 6870
},
{
"entropy": 1.1828125,
"epoch": 0.940276069427361,
"grad_norm": 0.07868564791527365,
"learning_rate": 2.731788079470199e-06,
"loss": 1.1988,
"mean_token_accuracy": 0.7158521234989166,
"num_tokens": 634093379.0,
"step": 6880
},
{
"entropy": 1.16953125,
"epoch": 0.941642749760831,
"grad_norm": 0.06997600530451786,
"learning_rate": 2.728265464280682e-06,
"loss": 1.1813,
"mean_token_accuracy": 0.7212384939193726,
"num_tokens": 634995635.0,
"step": 6890
},
{
"entropy": 1.16015625,
"epoch": 0.9430094300943009,
"grad_norm": 0.07319397412789987,
"learning_rate": 2.7247428490911656e-06,
"loss": 1.1632,
"mean_token_accuracy": 0.724156665802002,
"num_tokens": 635882762.0,
"step": 6900
},
{
"entropy": 1.15859375,
"epoch": 0.9443761104277709,
"grad_norm": 0.0743559553034425,
"learning_rate": 2.7212202339016487e-06,
"loss": 1.1688,
"mean_token_accuracy": 0.7234509110450744,
"num_tokens": 636815111.0,
"step": 6910
},
{
"entropy": 1.140625,
"epoch": 0.9457427907612409,
"grad_norm": 0.07036643591517219,
"learning_rate": 2.7176976187121322e-06,
"loss": 1.1486,
"mean_token_accuracy": 0.7263337314128876,
"num_tokens": 637800464.0,
"step": 6920
},
{
"entropy": 1.17265625,
"epoch": 0.947109471094711,
"grad_norm": 0.06915703291358567,
"learning_rate": 2.7141750035226154e-06,
"loss": 1.1719,
"mean_token_accuracy": 0.7210862159729003,
"num_tokens": 638762414.0,
"step": 6930
},
{
"entropy": 1.17890625,
"epoch": 0.948476151428181,
"grad_norm": 0.08377506024986504,
"learning_rate": 2.710652388333099e-06,
"loss": 1.1763,
"mean_token_accuracy": 0.7213742434978485,
"num_tokens": 639708115.0,
"step": 6940
},
{
"entropy": 1.15078125,
"epoch": 0.949842831761651,
"grad_norm": 0.07010811358034727,
"learning_rate": 2.707129773143582e-06,
"loss": 1.163,
"mean_token_accuracy": 0.7229729115962982,
"num_tokens": 640669487.0,
"step": 6950
},
{
"entropy": 1.121875,
"epoch": 0.951209512095121,
"grad_norm": 0.0721513371057462,
"learning_rate": 2.703607157954065e-06,
"loss": 1.1256,
"mean_token_accuracy": 0.7298852741718292,
"num_tokens": 641609525.0,
"step": 6960
},
{
"entropy": 1.19609375,
"epoch": 0.9525761924285909,
"grad_norm": 0.0782423356340587,
"learning_rate": 2.7000845427645483e-06,
"loss": 1.1961,
"mean_token_accuracy": 0.7170902609825134,
"num_tokens": 642492862.0,
"step": 6970
},
{
"entropy": 1.1234375,
"epoch": 0.9539428727620609,
"grad_norm": 0.07082446319850622,
"learning_rate": 2.6965619275750322e-06,
"loss": 1.1314,
"mean_token_accuracy": 0.7299052953720093,
"num_tokens": 643397187.0,
"step": 6980
},
{
"entropy": 1.1890625,
"epoch": 0.955309553095531,
"grad_norm": 0.07789343180973504,
"learning_rate": 2.6930393123855154e-06,
"loss": 1.1989,
"mean_token_accuracy": 0.7178477466106414,
"num_tokens": 644337919.0,
"step": 6990
},
{
"entropy": 1.20625,
"epoch": 0.956676233429001,
"grad_norm": 0.07295848577511418,
"learning_rate": 2.6895166971959985e-06,
"loss": 1.2118,
"mean_token_accuracy": 0.7139837861061096,
"num_tokens": 645236378.0,
"step": 7000
},
{
"entropy": 1.11796875,
"epoch": 0.958042913762471,
"grad_norm": 0.07432988726679955,
"learning_rate": 2.6859940820064816e-06,
"loss": 1.1176,
"mean_token_accuracy": 0.7333408117294311,
"num_tokens": 646140990.0,
"step": 7010
},
{
"entropy": 1.1953125,
"epoch": 0.959409594095941,
"grad_norm": 0.08381762545840467,
"learning_rate": 2.6824714668169647e-06,
"loss": 1.1944,
"mean_token_accuracy": 0.7183784186840058,
"num_tokens": 647048468.0,
"step": 7020
},
{
"entropy": 1.16875,
"epoch": 0.9607762744294109,
"grad_norm": 0.06940000750973872,
"learning_rate": 2.6789488516274487e-06,
"loss": 1.1751,
"mean_token_accuracy": 0.7222676396369934,
"num_tokens": 648040312.0,
"step": 7030
},
{
"entropy": 1.16875,
"epoch": 0.9621429547628809,
"grad_norm": 0.07620969946933621,
"learning_rate": 2.675426236437932e-06,
"loss": 1.1748,
"mean_token_accuracy": 0.722524619102478,
"num_tokens": 649004460.0,
"step": 7040
},
{
"entropy": 1.13203125,
"epoch": 0.963509635096351,
"grad_norm": 0.07247901036486443,
"learning_rate": 2.671903621248415e-06,
"loss": 1.1413,
"mean_token_accuracy": 0.7258232116699219,
"num_tokens": 649898524.0,
"step": 7050
},
{
"entropy": 1.1328125,
"epoch": 0.964876315429821,
"grad_norm": 0.07682279944536655,
"learning_rate": 2.668381006058898e-06,
"loss": 1.1279,
"mean_token_accuracy": 0.7274285078048706,
"num_tokens": 650801130.0,
"step": 7060
},
{
"entropy": 1.16953125,
"epoch": 0.966242995763291,
"grad_norm": 0.07658444350500243,
"learning_rate": 2.6648583908693816e-06,
"loss": 1.1618,
"mean_token_accuracy": 0.7236790835857392,
"num_tokens": 651718796.0,
"step": 7070
},
{
"entropy": 1.19140625,
"epoch": 0.967609676096761,
"grad_norm": 0.08259666630088373,
"learning_rate": 2.6613357756798648e-06,
"loss": 1.1974,
"mean_token_accuracy": 0.7160954117774964,
"num_tokens": 652638787.0,
"step": 7080
},
{
"entropy": 1.1515625,
"epoch": 0.968976356430231,
"grad_norm": 0.07410801827038271,
"learning_rate": 2.6578131604903483e-06,
"loss": 1.1559,
"mean_token_accuracy": 0.7251761257648468,
"num_tokens": 653525666.0,
"step": 7090
},
{
"entropy": 1.146875,
"epoch": 0.9703430367637009,
"grad_norm": 0.07451510556988092,
"learning_rate": 2.6542905453008314e-06,
"loss": 1.1509,
"mean_token_accuracy": 0.7267544746398926,
"num_tokens": 654449752.0,
"step": 7100
},
{
"entropy": 1.1734375,
"epoch": 0.971709717097171,
"grad_norm": 0.06903653829129948,
"learning_rate": 2.650767930111315e-06,
"loss": 1.1668,
"mean_token_accuracy": 0.7239104449748993,
"num_tokens": 655381010.0,
"step": 7110
},
{
"entropy": 1.16328125,
"epoch": 0.973076397430641,
"grad_norm": 0.07338914217131852,
"learning_rate": 2.647245314921798e-06,
"loss": 1.1599,
"mean_token_accuracy": 0.7244812548160553,
"num_tokens": 656339372.0,
"step": 7120
},
{
"entropy": 1.13828125,
"epoch": 0.974443077764111,
"grad_norm": 0.07109604992595774,
"learning_rate": 2.6437226997322812e-06,
"loss": 1.1302,
"mean_token_accuracy": 0.7301978647708893,
"num_tokens": 657308740.0,
"step": 7130
},
{
"entropy": 1.1296875,
"epoch": 0.975809758097581,
"grad_norm": 0.07203646422864121,
"learning_rate": 2.640200084542765e-06,
"loss": 1.1238,
"mean_token_accuracy": 0.7293971896171569,
"num_tokens": 658245328.0,
"step": 7140
},
{
"entropy": 1.162109375,
"epoch": 0.977176438431051,
"grad_norm": 0.07736550004179239,
"learning_rate": 2.6366774693532483e-06,
"loss": 1.1729,
"mean_token_accuracy": 0.7222187757492066,
"num_tokens": 659099116.0,
"step": 7150
},
{
"entropy": 1.18046875,
"epoch": 0.9785431187645209,
"grad_norm": 0.08576524222050283,
"learning_rate": 2.6331548541637314e-06,
"loss": 1.1893,
"mean_token_accuracy": 0.7202797114849091,
"num_tokens": 660019179.0,
"step": 7160
},
{
"entropy": 1.1328125,
"epoch": 0.979909799097991,
"grad_norm": 0.0703817417588252,
"learning_rate": 2.6296322389742146e-06,
"loss": 1.1378,
"mean_token_accuracy": 0.7290609896183013,
"num_tokens": 660951954.0,
"step": 7170
},
{
"entropy": 1.10546875,
"epoch": 0.981276479431461,
"grad_norm": 0.0752339624439464,
"learning_rate": 2.6261096237846977e-06,
"loss": 1.1082,
"mean_token_accuracy": 0.7334511637687683,
"num_tokens": 661862699.0,
"step": 7180
},
{
"entropy": 1.13984375,
"epoch": 0.982643159764931,
"grad_norm": 0.07712280890063228,
"learning_rate": 2.622587008595181e-06,
"loss": 1.1412,
"mean_token_accuracy": 0.7278154075145722,
"num_tokens": 662762440.0,
"step": 7190
},
{
"entropy": 1.16953125,
"epoch": 0.984009840098401,
"grad_norm": 0.08656066720011525,
"learning_rate": 2.6190643934056648e-06,
"loss": 1.161,
"mean_token_accuracy": 0.7224133551120758,
"num_tokens": 663695752.0,
"step": 7200
},
{
"entropy": 1.203515625,
"epoch": 0.985376520431871,
"grad_norm": 0.07828346842180274,
"learning_rate": 2.615541778216148e-06,
"loss": 1.2092,
"mean_token_accuracy": 0.7154077529907227,
"num_tokens": 664595316.0,
"step": 7210
},
{
"entropy": 1.13359375,
"epoch": 0.9867432007653409,
"grad_norm": 0.07232709609596628,
"learning_rate": 2.612019163026631e-06,
"loss": 1.136,
"mean_token_accuracy": 0.7283076047897339,
"num_tokens": 665499511.0,
"step": 7220
},
{
"entropy": 1.20390625,
"epoch": 0.988109881098811,
"grad_norm": 0.07819844262835116,
"learning_rate": 2.6084965478371146e-06,
"loss": 1.2114,
"mean_token_accuracy": 0.7144841432571412,
"num_tokens": 666424892.0,
"step": 7230
},
{
"entropy": 1.126953125,
"epoch": 0.989476561432281,
"grad_norm": 0.07219649511628821,
"learning_rate": 2.6049739326475977e-06,
"loss": 1.138,
"mean_token_accuracy": 0.7284655928611755,
"num_tokens": 667349028.0,
"step": 7240
},
{
"entropy": 1.178125,
"epoch": 0.990843241765751,
"grad_norm": 0.0791688806848296,
"learning_rate": 2.6014513174580812e-06,
"loss": 1.1688,
"mean_token_accuracy": 0.7214187681674957,
"num_tokens": 668242767.0,
"step": 7250
},
{
"entropy": 1.153125,
"epoch": 0.992209922099221,
"grad_norm": 0.11881050474668275,
"learning_rate": 2.5979287022685644e-06,
"loss": 1.164,
"mean_token_accuracy": 0.7240462362766266,
"num_tokens": 669135549.0,
"step": 7260
},
{
"entropy": 1.1203125,
"epoch": 0.993576602432691,
"grad_norm": 0.07142563399725191,
"learning_rate": 2.594406087079048e-06,
"loss": 1.1178,
"mean_token_accuracy": 0.7315175712108613,
"num_tokens": 670042351.0,
"step": 7270
},
{
"entropy": 1.17109375,
"epoch": 0.994943282766161,
"grad_norm": 0.07186032984758249,
"learning_rate": 2.590883471889531e-06,
"loss": 1.1716,
"mean_token_accuracy": 0.7213475227355957,
"num_tokens": 670950924.0,
"step": 7280
},
{
"entropy": 1.134375,
"epoch": 0.996309963099631,
"grad_norm": 0.07512110082252975,
"learning_rate": 2.587360856700014e-06,
"loss": 1.1316,
"mean_token_accuracy": 0.7284415245056153,
"num_tokens": 671855775.0,
"step": 7290
},
{
"entropy": 1.1328125,
"epoch": 0.997676643433101,
"grad_norm": 0.07165722735039291,
"learning_rate": 2.5838382415104973e-06,
"loss": 1.1288,
"mean_token_accuracy": 0.7284874141216278,
"num_tokens": 672778864.0,
"step": 7300
},
{
"entropy": 1.18671875,
"epoch": 0.999043323766571,
"grad_norm": 0.06765581916536742,
"learning_rate": 2.5803156263209813e-06,
"loss": 1.1974,
"mean_token_accuracy": 0.7151514708995819,
"num_tokens": 673675531.0,
"step": 7310
},
{
"entropy": 1.17734375,
"epoch": 1.000410004100041,
"grad_norm": 0.0737252323207834,
"learning_rate": 2.5767930111314644e-06,
"loss": 1.1788,
"mean_token_accuracy": 0.7207051038742065,
"num_tokens": 674636327.0,
"step": 7320
},
{
"entropy": 1.164453125,
"epoch": 1.001776684433511,
"grad_norm": 0.08062450695233865,
"learning_rate": 2.5732703959419475e-06,
"loss": 1.1646,
"mean_token_accuracy": 0.7225695312023163,
"num_tokens": 675525894.0,
"step": 7330
},
{
"entropy": 1.1296875,
"epoch": 1.003143364766981,
"grad_norm": 0.06403573237144083,
"learning_rate": 2.5697477807524306e-06,
"loss": 1.1384,
"mean_token_accuracy": 0.7271638453006745,
"num_tokens": 676510301.0,
"step": 7340
},
{
"entropy": 1.120703125,
"epoch": 1.004510045100451,
"grad_norm": 0.09330903136306859,
"learning_rate": 2.5662251655629138e-06,
"loss": 1.1099,
"mean_token_accuracy": 0.7332711040973663,
"num_tokens": 677432141.0,
"step": 7350
},
{
"entropy": 1.16953125,
"epoch": 1.005876725433921,
"grad_norm": 0.07343465149432463,
"learning_rate": 2.5627025503733977e-06,
"loss": 1.173,
"mean_token_accuracy": 0.7202249050140381,
"num_tokens": 678360785.0,
"step": 7360
},
{
"entropy": 1.1921875,
"epoch": 1.007243405767391,
"grad_norm": 0.26020796556434744,
"learning_rate": 2.559179935183881e-06,
"loss": 1.1846,
"mean_token_accuracy": 0.7196113526821136,
"num_tokens": 679264490.0,
"step": 7370
},
{
"entropy": 1.16484375,
"epoch": 1.0086100861008611,
"grad_norm": 0.0760246458015285,
"learning_rate": 2.555657319994364e-06,
"loss": 1.168,
"mean_token_accuracy": 0.7237367868423462,
"num_tokens": 680184449.0,
"step": 7380
},
{
"entropy": 1.12265625,
"epoch": 1.009976766434331,
"grad_norm": 0.07511808845405891,
"learning_rate": 2.552134704804847e-06,
"loss": 1.11,
"mean_token_accuracy": 0.7317633628845215,
"num_tokens": 681146324.0,
"step": 7390
},
{
"entropy": 1.19609375,
"epoch": 1.011343446767801,
"grad_norm": 0.07892746606627066,
"learning_rate": 2.5486120896153306e-06,
"loss": 1.2123,
"mean_token_accuracy": 0.7130722641944885,
"num_tokens": 682048165.0,
"step": 7400
},
{
"entropy": 1.1453125,
"epoch": 1.012710127101271,
"grad_norm": 0.0769192150627566,
"learning_rate": 2.5450894744258138e-06,
"loss": 1.1441,
"mean_token_accuracy": 0.7279314994812012,
"num_tokens": 682971836.0,
"step": 7410
},
{
"entropy": 1.1140625,
"epoch": 1.014076807434741,
"grad_norm": 0.07741063549951126,
"learning_rate": 2.5415668592362973e-06,
"loss": 1.1098,
"mean_token_accuracy": 0.7347835302352905,
"num_tokens": 683851000.0,
"step": 7420
},
{
"entropy": 1.12421875,
"epoch": 1.015443487768211,
"grad_norm": 0.07076997925305578,
"learning_rate": 2.5380442440467804e-06,
"loss": 1.1256,
"mean_token_accuracy": 0.7318951189517975,
"num_tokens": 684791401.0,
"step": 7430
},
{
"entropy": 1.203125,
"epoch": 1.016810168101681,
"grad_norm": 0.083182842448712,
"learning_rate": 2.534521628857264e-06,
"loss": 1.2093,
"mean_token_accuracy": 0.7143947660923005,
"num_tokens": 685705685.0,
"step": 7440
},
{
"entropy": 1.15078125,
"epoch": 1.018176848435151,
"grad_norm": 0.08262242878762252,
"learning_rate": 2.530999013667747e-06,
"loss": 1.147,
"mean_token_accuracy": 0.7274811923503876,
"num_tokens": 686605303.0,
"step": 7450
},
{
"entropy": 1.234375,
"epoch": 1.019543528768621,
"grad_norm": 0.07832607508362774,
"learning_rate": 2.5274763984782302e-06,
"loss": 1.234,
"mean_token_accuracy": 0.7120592892169952,
"num_tokens": 687526237.0,
"step": 7460
},
{
"entropy": 1.100390625,
"epoch": 1.020910209102091,
"grad_norm": 0.06908402850925925,
"learning_rate": 2.5239537832887138e-06,
"loss": 1.1094,
"mean_token_accuracy": 0.7344645440578461,
"num_tokens": 688486449.0,
"step": 7470
},
{
"entropy": 1.15078125,
"epoch": 1.022276889435561,
"grad_norm": 0.07293553396908793,
"learning_rate": 2.5204311680991973e-06,
"loss": 1.15,
"mean_token_accuracy": 0.7241282165050507,
"num_tokens": 689414922.0,
"step": 7480
},
{
"entropy": 1.11953125,
"epoch": 1.023643569769031,
"grad_norm": 0.0725638693603345,
"learning_rate": 2.5169085529096804e-06,
"loss": 1.1232,
"mean_token_accuracy": 0.7312600493431092,
"num_tokens": 690270353.0,
"step": 7490
},
{
"entropy": 1.1640625,
"epoch": 1.0250102501025011,
"grad_norm": 0.07587030552040976,
"learning_rate": 2.5133859377201636e-06,
"loss": 1.163,
"mean_token_accuracy": 0.7218397200107575,
"num_tokens": 691186114.0,
"step": 7500
},
{
"entropy": 1.19453125,
"epoch": 1.0263769304359711,
"grad_norm": 0.07306313288566894,
"learning_rate": 2.5098633225306467e-06,
"loss": 1.1933,
"mean_token_accuracy": 0.7181048035621643,
"num_tokens": 692117025.0,
"step": 7510
},
{
"entropy": 1.1375,
"epoch": 1.027743610769441,
"grad_norm": 0.07760896434504456,
"learning_rate": 2.50634070734113e-06,
"loss": 1.1386,
"mean_token_accuracy": 0.7283733010292053,
"num_tokens": 693050611.0,
"step": 7520
},
{
"entropy": 1.14375,
"epoch": 1.029110291102911,
"grad_norm": 0.06928090778735926,
"learning_rate": 2.502818092151614e-06,
"loss": 1.1449,
"mean_token_accuracy": 0.7273295640945434,
"num_tokens": 693952358.0,
"step": 7530
},
{
"entropy": 1.13671875,
"epoch": 1.030476971436381,
"grad_norm": 0.07705847567527405,
"learning_rate": 2.499295476962097e-06,
"loss": 1.123,
"mean_token_accuracy": 0.7290900886058808,
"num_tokens": 694878753.0,
"step": 7540
},
{
"entropy": 1.140625,
"epoch": 1.031843651769851,
"grad_norm": 0.07390645950470319,
"learning_rate": 2.49577286177258e-06,
"loss": 1.1486,
"mean_token_accuracy": 0.7276991546154022,
"num_tokens": 695831733.0,
"step": 7550
},
{
"entropy": 1.16640625,
"epoch": 1.033210332103321,
"grad_norm": 0.07750752986994666,
"learning_rate": 2.492250246583063e-06,
"loss": 1.1663,
"mean_token_accuracy": 0.7234357297420502,
"num_tokens": 696739857.0,
"step": 7560
},
{
"entropy": 1.13984375,
"epoch": 1.034577012436791,
"grad_norm": 0.07334788934683574,
"learning_rate": 2.4887276313935467e-06,
"loss": 1.1351,
"mean_token_accuracy": 0.728349757194519,
"num_tokens": 697643679.0,
"step": 7570
},
{
"entropy": 1.1265625,
"epoch": 1.035943692770261,
"grad_norm": 0.06683788837024443,
"learning_rate": 2.48520501620403e-06,
"loss": 1.1302,
"mean_token_accuracy": 0.7296790242195129,
"num_tokens": 698587437.0,
"step": 7580
},
{
"entropy": 1.0984375,
"epoch": 1.037310373103731,
"grad_norm": 0.06994109929123302,
"learning_rate": 2.4816824010145134e-06,
"loss": 1.1027,
"mean_token_accuracy": 0.7324663281440735,
"num_tokens": 699490277.0,
"step": 7590
},
{
"entropy": 1.1421875,
"epoch": 1.038677053437201,
"grad_norm": 0.0720139758486299,
"learning_rate": 2.478159785824997e-06,
"loss": 1.1417,
"mean_token_accuracy": 0.7271109819412231,
"num_tokens": 700469765.0,
"step": 7600
},
{
"entropy": 1.16015625,
"epoch": 1.040043733770671,
"grad_norm": 0.07808117525971063,
"learning_rate": 2.47463717063548e-06,
"loss": 1.1698,
"mean_token_accuracy": 0.7229463934898377,
"num_tokens": 701399673.0,
"step": 7610
},
{
"entropy": 1.1796875,
"epoch": 1.0414104141041411,
"grad_norm": 0.0738578109492815,
"learning_rate": 2.4711145554459636e-06,
"loss": 1.1829,
"mean_token_accuracy": 0.7196498274803161,
"num_tokens": 702352041.0,
"step": 7620
},
{
"entropy": 1.17734375,
"epoch": 1.0427770944376111,
"grad_norm": 0.06843560539691947,
"learning_rate": 2.4675919402564467e-06,
"loss": 1.1729,
"mean_token_accuracy": 0.721290385723114,
"num_tokens": 703273557.0,
"step": 7630
},
{
"entropy": 1.15546875,
"epoch": 1.0441437747710811,
"grad_norm": 0.07186953177742417,
"learning_rate": 2.46406932506693e-06,
"loss": 1.1547,
"mean_token_accuracy": 0.724176424741745,
"num_tokens": 704228082.0,
"step": 7640
},
{
"entropy": 1.14296875,
"epoch": 1.045510455104551,
"grad_norm": 0.0806035490541247,
"learning_rate": 2.4605467098774134e-06,
"loss": 1.1454,
"mean_token_accuracy": 0.7277532815933228,
"num_tokens": 705192144.0,
"step": 7650
},
{
"entropy": 1.1171875,
"epoch": 1.046877135438021,
"grad_norm": 0.07264467317879364,
"learning_rate": 2.4570240946878965e-06,
"loss": 1.1204,
"mean_token_accuracy": 0.7304138779640198,
"num_tokens": 706106625.0,
"step": 7660
},
{
"entropy": 1.14921875,
"epoch": 1.048243815771491,
"grad_norm": 0.07405573551440274,
"learning_rate": 2.4535014794983796e-06,
"loss": 1.1601,
"mean_token_accuracy": 0.723319411277771,
"num_tokens": 707021496.0,
"step": 7670
},
{
"entropy": 1.1296875,
"epoch": 1.049610496104961,
"grad_norm": 0.08044694972472169,
"learning_rate": 2.449978864308863e-06,
"loss": 1.1253,
"mean_token_accuracy": 0.7317678928375244,
"num_tokens": 707906178.0,
"step": 7680
},
{
"entropy": 1.14765625,
"epoch": 1.050977176438431,
"grad_norm": 0.07844821530125186,
"learning_rate": 2.4464562491193463e-06,
"loss": 1.1415,
"mean_token_accuracy": 0.727269846200943,
"num_tokens": 708801138.0,
"step": 7690
},
{
"entropy": 1.18125,
"epoch": 1.052343856771901,
"grad_norm": 0.08667410990887943,
"learning_rate": 2.4429336339298294e-06,
"loss": 1.2024,
"mean_token_accuracy": 0.7173227488994598,
"num_tokens": 709746763.0,
"step": 7700
},
{
"entropy": 1.084765625,
"epoch": 1.053710537105371,
"grad_norm": 0.11651811350000181,
"learning_rate": 2.439411018740313e-06,
"loss": 1.0837,
"mean_token_accuracy": 0.7364135384559631,
"num_tokens": 710631483.0,
"step": 7710
},
{
"entropy": 1.11171875,
"epoch": 1.055077217438841,
"grad_norm": 0.06849604508224683,
"learning_rate": 2.435888403550796e-06,
"loss": 1.1061,
"mean_token_accuracy": 0.7319375813007355,
"num_tokens": 711528540.0,
"step": 7720
},
{
"entropy": 1.1546875,
"epoch": 1.056443897772311,
"grad_norm": 0.07639506544915134,
"learning_rate": 2.4323657883612797e-06,
"loss": 1.169,
"mean_token_accuracy": 0.7233155548572541,
"num_tokens": 712460832.0,
"step": 7730
},
{
"entropy": 1.16875,
"epoch": 1.0578105781057812,
"grad_norm": 0.07662580773474521,
"learning_rate": 2.4288431731717628e-06,
"loss": 1.1737,
"mean_token_accuracy": 0.7220523357391357,
"num_tokens": 713350708.0,
"step": 7740
},
{
"entropy": 1.13828125,
"epoch": 1.0591772584392511,
"grad_norm": 0.08110810285835936,
"learning_rate": 2.4253205579822463e-06,
"loss": 1.1427,
"mean_token_accuracy": 0.7258868992328644,
"num_tokens": 714247281.0,
"step": 7750
},
{
"entropy": 1.14140625,
"epoch": 1.0605439387727211,
"grad_norm": 0.07886152304217262,
"learning_rate": 2.4217979427927294e-06,
"loss": 1.1335,
"mean_token_accuracy": 0.7291694462299347,
"num_tokens": 715138854.0,
"step": 7760
},
{
"entropy": 1.1078125,
"epoch": 1.0619106191061911,
"grad_norm": 0.06959952521732587,
"learning_rate": 2.418275327603213e-06,
"loss": 1.1068,
"mean_token_accuracy": 0.734289026260376,
"num_tokens": 716042245.0,
"step": 7770
},
{
"entropy": 1.129296875,
"epoch": 1.063277299439661,
"grad_norm": 0.08232395696155809,
"learning_rate": 2.414752712413696e-06,
"loss": 1.1181,
"mean_token_accuracy": 0.7306691646575928,
"num_tokens": 716962640.0,
"step": 7780
},
{
"entropy": 1.11796875,
"epoch": 1.064643979773131,
"grad_norm": 0.07134934414717024,
"learning_rate": 2.4112300972241797e-06,
"loss": 1.1176,
"mean_token_accuracy": 0.732305234670639,
"num_tokens": 717870625.0,
"step": 7790
},
{
"entropy": 1.15078125,
"epoch": 1.066010660106601,
"grad_norm": 0.07165648706542407,
"learning_rate": 2.4077074820346628e-06,
"loss": 1.157,
"mean_token_accuracy": 0.7240320324897767,
"num_tokens": 718799421.0,
"step": 7800
},
{
"entropy": 1.17421875,
"epoch": 1.067377340440071,
"grad_norm": 0.07824344190339863,
"learning_rate": 2.404184866845146e-06,
"loss": 1.1788,
"mean_token_accuracy": 0.7195448756217957,
"num_tokens": 719746616.0,
"step": 7810
},
{
"entropy": 1.15625,
"epoch": 1.068744020773541,
"grad_norm": 0.08836751986859917,
"learning_rate": 2.4006622516556295e-06,
"loss": 1.1519,
"mean_token_accuracy": 0.7249668836593628,
"num_tokens": 720672544.0,
"step": 7820
},
{
"entropy": 1.1734375,
"epoch": 1.070110701107011,
"grad_norm": 0.08072215620466823,
"learning_rate": 2.3971396364661126e-06,
"loss": 1.174,
"mean_token_accuracy": 0.7209043681621552,
"num_tokens": 721583384.0,
"step": 7830
},
{
"entropy": 1.1765625,
"epoch": 1.071477381440481,
"grad_norm": 0.07464518814237596,
"learning_rate": 2.393617021276596e-06,
"loss": 1.1768,
"mean_token_accuracy": 0.7218499720096588,
"num_tokens": 722493561.0,
"step": 7840
},
{
"entropy": 1.15703125,
"epoch": 1.072844061773951,
"grad_norm": 0.073525270940103,
"learning_rate": 2.3900944060870793e-06,
"loss": 1.1693,
"mean_token_accuracy": 0.7235225141048431,
"num_tokens": 723384528.0,
"step": 7850
},
{
"entropy": 1.161328125,
"epoch": 1.0742107421074212,
"grad_norm": 0.12820591347505253,
"learning_rate": 2.3865717908975624e-06,
"loss": 1.1546,
"mean_token_accuracy": 0.7244865536689759,
"num_tokens": 724319167.0,
"step": 7860
},
{
"entropy": 1.172265625,
"epoch": 1.0755774224408912,
"grad_norm": 0.07555851588034271,
"learning_rate": 2.383049175708046e-06,
"loss": 1.1717,
"mean_token_accuracy": 0.7218300342559815,
"num_tokens": 725236882.0,
"step": 7870
},
{
"entropy": 1.1265625,
"epoch": 1.0769441027743611,
"grad_norm": 0.08316829263209702,
"learning_rate": 2.379526560518529e-06,
"loss": 1.1319,
"mean_token_accuracy": 0.7281768321990967,
"num_tokens": 726143276.0,
"step": 7880
},
{
"entropy": 1.1921875,
"epoch": 1.0783107831078311,
"grad_norm": 0.09822269481300443,
"learning_rate": 2.376003945329012e-06,
"loss": 1.1852,
"mean_token_accuracy": 0.7175304174423218,
"num_tokens": 727083219.0,
"step": 7890
},
{
"entropy": 1.09765625,
"epoch": 1.079677463441301,
"grad_norm": 0.07806797922728863,
"learning_rate": 2.3724813301394957e-06,
"loss": 1.1011,
"mean_token_accuracy": 0.7350208938121796,
"num_tokens": 727975162.0,
"step": 7900
},
{
"entropy": 1.12109375,
"epoch": 1.081044143774771,
"grad_norm": 0.08906757002325898,
"learning_rate": 2.368958714949979e-06,
"loss": 1.1188,
"mean_token_accuracy": 0.7319762229919433,
"num_tokens": 728859542.0,
"step": 7910
},
{
"entropy": 1.1421875,
"epoch": 1.082410824108241,
"grad_norm": 0.07780311192901886,
"learning_rate": 2.3654360997604624e-06,
"loss": 1.1295,
"mean_token_accuracy": 0.7288353145122528,
"num_tokens": 729761593.0,
"step": 7920
},
{
"entropy": 1.16015625,
"epoch": 1.083777504441711,
"grad_norm": 0.07323977699208546,
"learning_rate": 2.3619134845709455e-06,
"loss": 1.1667,
"mean_token_accuracy": 0.7241620421409607,
"num_tokens": 730709039.0,
"step": 7930
},
{
"entropy": 1.14609375,
"epoch": 1.085144184775181,
"grad_norm": 0.06926288231617071,
"learning_rate": 2.358390869381429e-06,
"loss": 1.139,
"mean_token_accuracy": 0.728891009092331,
"num_tokens": 731624659.0,
"step": 7940
},
{
"entropy": 1.184375,
"epoch": 1.086510865108651,
"grad_norm": 0.07974319502122963,
"learning_rate": 2.354868254191912e-06,
"loss": 1.1984,
"mean_token_accuracy": 0.717540156841278,
"num_tokens": 732566122.0,
"step": 7950
},
{
"entropy": 1.15546875,
"epoch": 1.087877545442121,
"grad_norm": 0.08205649312915086,
"learning_rate": 2.3513456390023957e-06,
"loss": 1.1605,
"mean_token_accuracy": 0.7247389853000641,
"num_tokens": 733499588.0,
"step": 7960
},
{
"entropy": 1.18046875,
"epoch": 1.089244225775591,
"grad_norm": 0.07548493758508965,
"learning_rate": 2.347823023812879e-06,
"loss": 1.1857,
"mean_token_accuracy": 0.7190035581588745,
"num_tokens": 734386011.0,
"step": 7970
},
{
"entropy": 1.18515625,
"epoch": 1.090610906109061,
"grad_norm": 0.07276364850927027,
"learning_rate": 2.3443004086233624e-06,
"loss": 1.1837,
"mean_token_accuracy": 0.7195825278759003,
"num_tokens": 735290799.0,
"step": 7980
},
{
"entropy": 1.12421875,
"epoch": 1.0919775864425312,
"grad_norm": 0.07413756283242254,
"learning_rate": 2.3407777934338455e-06,
"loss": 1.1286,
"mean_token_accuracy": 0.7311835706233978,
"num_tokens": 736222579.0,
"step": 7990
},
{
"entropy": 1.135546875,
"epoch": 1.0933442667760012,
"grad_norm": 0.0748903532898809,
"learning_rate": 2.3372551782443286e-06,
"loss": 1.1331,
"mean_token_accuracy": 0.7291298925876617,
"num_tokens": 737134733.0,
"step": 8000
},
{
"entropy": 1.15703125,
"epoch": 1.0947109471094711,
"grad_norm": 0.06671487209381562,
"learning_rate": 2.333732563054812e-06,
"loss": 1.1611,
"mean_token_accuracy": 0.7236049234867096,
"num_tokens": 738079862.0,
"step": 8010
},
{
"entropy": 1.158984375,
"epoch": 1.0960776274429411,
"grad_norm": 0.07428497712108813,
"learning_rate": 2.3302099478652953e-06,
"loss": 1.1617,
"mean_token_accuracy": 0.7234541535377502,
"num_tokens": 739009539.0,
"step": 8020
},
{
"entropy": 1.15625,
"epoch": 1.097444307776411,
"grad_norm": 0.07845626452691976,
"learning_rate": 2.3266873326757784e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.7230498313903808,
"num_tokens": 739893426.0,
"step": 8030
},
{
"entropy": 1.21796875,
"epoch": 1.098810988109881,
"grad_norm": 0.07693706342071038,
"learning_rate": 2.323164717486262e-06,
"loss": 1.2273,
"mean_token_accuracy": 0.7093816041946411,
"num_tokens": 740840806.0,
"step": 8040
},
{
"entropy": 1.078125,
"epoch": 1.100177668443351,
"grad_norm": 0.07051539464841743,
"learning_rate": 2.319642102296745e-06,
"loss": 1.0838,
"mean_token_accuracy": 0.7390916228294373,
"num_tokens": 741766338.0,
"step": 8050
},
{
"entropy": 1.14921875,
"epoch": 1.101544348776821,
"grad_norm": 0.0700198260807008,
"learning_rate": 2.3161194871072287e-06,
"loss": 1.15,
"mean_token_accuracy": 0.7243112623691559,
"num_tokens": 742676539.0,
"step": 8060
},
{
"entropy": 1.1265625,
"epoch": 1.102911029110291,
"grad_norm": 0.06892087242600473,
"learning_rate": 2.3125968719177118e-06,
"loss": 1.1248,
"mean_token_accuracy": 0.7307648420333862,
"num_tokens": 743576261.0,
"step": 8070
},
{
"entropy": 1.19609375,
"epoch": 1.104277709443761,
"grad_norm": 0.07512326910324399,
"learning_rate": 2.3090742567281953e-06,
"loss": 1.1846,
"mean_token_accuracy": 0.7205294787883758,
"num_tokens": 744533015.0,
"step": 8080
},
{
"entropy": 1.13359375,
"epoch": 1.105644389777231,
"grad_norm": 0.07209817267981773,
"learning_rate": 2.3055516415386785e-06,
"loss": 1.1373,
"mean_token_accuracy": 0.7290755987167359,
"num_tokens": 745439539.0,
"step": 8090
},
{
"entropy": 1.15390625,
"epoch": 1.1070110701107012,
"grad_norm": 0.09106654509548787,
"learning_rate": 2.302029026349162e-06,
"loss": 1.1626,
"mean_token_accuracy": 0.7245008468627929,
"num_tokens": 746344325.0,
"step": 8100
},
{
"entropy": 1.16171875,
"epoch": 1.1083777504441712,
"grad_norm": 0.08041291554928763,
"learning_rate": 2.298506411159645e-06,
"loss": 1.1674,
"mean_token_accuracy": 0.7222560226917267,
"num_tokens": 747287311.0,
"step": 8110
},
{
"entropy": 1.1359375,
"epoch": 1.1097444307776412,
"grad_norm": 0.0671539913361199,
"learning_rate": 2.2949837959701287e-06,
"loss": 1.1298,
"mean_token_accuracy": 0.7289430797100067,
"num_tokens": 748278211.0,
"step": 8120
},
{
"entropy": 1.17265625,
"epoch": 1.1111111111111112,
"grad_norm": 0.07733627687732952,
"learning_rate": 2.291461180780612e-06,
"loss": 1.1724,
"mean_token_accuracy": 0.7217030942440033,
"num_tokens": 749232277.0,
"step": 8130
},
{
"entropy": 1.15859375,
"epoch": 1.1124777914445811,
"grad_norm": 0.07412143645855149,
"learning_rate": 2.287938565591095e-06,
"loss": 1.1596,
"mean_token_accuracy": 0.7237881600856781,
"num_tokens": 750157024.0,
"step": 8140
},
{
"entropy": 1.1125,
"epoch": 1.1138444717780511,
"grad_norm": 0.0720996156326291,
"learning_rate": 2.2844159504015785e-06,
"loss": 1.1194,
"mean_token_accuracy": 0.7321566998958587,
"num_tokens": 751102311.0,
"step": 8150
},
{
"entropy": 1.148046875,
"epoch": 1.115211152111521,
"grad_norm": 0.08084776516433385,
"learning_rate": 2.2808933352120616e-06,
"loss": 1.1437,
"mean_token_accuracy": 0.7267088472843171,
"num_tokens": 751970935.0,
"step": 8160
},
{
"entropy": 1.1453125,
"epoch": 1.116577832444991,
"grad_norm": 0.07116354568512419,
"learning_rate": 2.277370720022545e-06,
"loss": 1.1456,
"mean_token_accuracy": 0.7271084308624267,
"num_tokens": 752883996.0,
"step": 8170
},
{
"entropy": 1.14765625,
"epoch": 1.117944512778461,
"grad_norm": 0.07421942900529195,
"learning_rate": 2.2738481048330283e-06,
"loss": 1.1425,
"mean_token_accuracy": 0.7260145425796509,
"num_tokens": 753824067.0,
"step": 8180
},
{
"entropy": 1.140234375,
"epoch": 1.119311193111931,
"grad_norm": 0.07457015370507009,
"learning_rate": 2.2703254896435114e-06,
"loss": 1.1523,
"mean_token_accuracy": 0.7251345932483673,
"num_tokens": 754780634.0,
"step": 8190
},
{
"entropy": 1.175,
"epoch": 1.120677873445401,
"grad_norm": 0.06690939256213552,
"learning_rate": 2.266802874453995e-06,
"loss": 1.1843,
"mean_token_accuracy": 0.7191133141517639,
"num_tokens": 755718040.0,
"step": 8200
},
{
"entropy": 1.159375,
"epoch": 1.122044553778871,
"grad_norm": 0.0806109429348856,
"learning_rate": 2.263280259264478e-06,
"loss": 1.1546,
"mean_token_accuracy": 0.7256765007972718,
"num_tokens": 756620588.0,
"step": 8210
},
{
"entropy": 1.1140625,
"epoch": 1.123411234112341,
"grad_norm": 0.07359715683218891,
"learning_rate": 2.259757644074961e-06,
"loss": 1.1047,
"mean_token_accuracy": 0.7341251850128174,
"num_tokens": 757521919.0,
"step": 8220
},
{
"entropy": 1.159375,
"epoch": 1.1247779144458112,
"grad_norm": 0.08212548026596359,
"learning_rate": 2.2562350288854447e-06,
"loss": 1.1535,
"mean_token_accuracy": 0.725735342502594,
"num_tokens": 758445977.0,
"step": 8230
},
{
"entropy": 1.125,
"epoch": 1.1261445947792812,
"grad_norm": 0.07626577886961808,
"learning_rate": 2.252712413695928e-06,
"loss": 1.1183,
"mean_token_accuracy": 0.732667726278305,
"num_tokens": 759331953.0,
"step": 8240
},
{
"entropy": 1.1453125,
"epoch": 1.1275112751127512,
"grad_norm": 0.07340238900533853,
"learning_rate": 2.2491897985064114e-06,
"loss": 1.1485,
"mean_token_accuracy": 0.725453668832779,
"num_tokens": 760273968.0,
"step": 8250
},
{
"entropy": 1.13125,
"epoch": 1.1288779554462212,
"grad_norm": 0.07345693115597439,
"learning_rate": 2.2456671833168945e-06,
"loss": 1.1354,
"mean_token_accuracy": 0.7275841295719147,
"num_tokens": 761159352.0,
"step": 8260
},
{
"entropy": 1.14921875,
"epoch": 1.1302446357796911,
"grad_norm": 0.07479600475836908,
"learning_rate": 2.242144568127378e-06,
"loss": 1.144,
"mean_token_accuracy": 0.72788867354393,
"num_tokens": 762051558.0,
"step": 8270
},
{
"entropy": 1.1734375,
"epoch": 1.1316113161131611,
"grad_norm": 0.07011095565070748,
"learning_rate": 2.238621952937861e-06,
"loss": 1.1839,
"mean_token_accuracy": 0.7203634679317474,
"num_tokens": 763006454.0,
"step": 8280
},
{
"entropy": 1.19296875,
"epoch": 1.132977996446631,
"grad_norm": 0.07742039709204987,
"learning_rate": 2.2350993377483447e-06,
"loss": 1.1889,
"mean_token_accuracy": 0.7187819421291352,
"num_tokens": 763919479.0,
"step": 8290
},
{
"entropy": 1.1515625,
"epoch": 1.134344676780101,
"grad_norm": 0.07683583522725311,
"learning_rate": 2.231576722558828e-06,
"loss": 1.1553,
"mean_token_accuracy": 0.7243268013000488,
"num_tokens": 764823373.0,
"step": 8300
},
{
"entropy": 1.128125,
"epoch": 1.135711357113571,
"grad_norm": 0.07600187476387855,
"learning_rate": 2.2280541073693114e-06,
"loss": 1.1313,
"mean_token_accuracy": 0.7298314929008484,
"num_tokens": 765754606.0,
"step": 8310
},
{
"entropy": 1.1984375,
"epoch": 1.137078037447041,
"grad_norm": 0.0765675139221844,
"learning_rate": 2.2245314921797945e-06,
"loss": 1.2056,
"mean_token_accuracy": 0.7164188206195832,
"num_tokens": 766685236.0,
"step": 8320
},
{
"entropy": 1.1671875,
"epoch": 1.1384447177805113,
"grad_norm": 0.07247268747498811,
"learning_rate": 2.2210088769902777e-06,
"loss": 1.1708,
"mean_token_accuracy": 0.7230432331562042,
"num_tokens": 767625646.0,
"step": 8330
},
{
"entropy": 1.05390625,
"epoch": 1.1398113981139812,
"grad_norm": 0.07619311277159813,
"learning_rate": 2.217486261800761e-06,
"loss": 1.0589,
"mean_token_accuracy": 0.7411799252033233,
"num_tokens": 768490930.0,
"step": 8340
},
{
"entropy": 1.18203125,
"epoch": 1.1411780784474512,
"grad_norm": 0.07607516782311423,
"learning_rate": 2.2139636466112443e-06,
"loss": 1.1872,
"mean_token_accuracy": 0.7201533794403077,
"num_tokens": 769444909.0,
"step": 8350
},
{
"entropy": 1.11640625,
"epoch": 1.1425447587809212,
"grad_norm": 0.07049086164948205,
"learning_rate": 2.2104410314217275e-06,
"loss": 1.1184,
"mean_token_accuracy": 0.7323162019252777,
"num_tokens": 770354512.0,
"step": 8360
},
{
"entropy": 1.16796875,
"epoch": 1.1439114391143912,
"grad_norm": 0.07562117464677191,
"learning_rate": 2.206918416232211e-06,
"loss": 1.1708,
"mean_token_accuracy": 0.7222756624221802,
"num_tokens": 771302117.0,
"step": 8370
},
{
"entropy": 1.1703125,
"epoch": 1.1452781194478612,
"grad_norm": 0.07182738695085673,
"learning_rate": 2.203395801042694e-06,
"loss": 1.1687,
"mean_token_accuracy": 0.7207498490810395,
"num_tokens": 772242430.0,
"step": 8380
},
{
"entropy": 1.15625,
"epoch": 1.1466447997813312,
"grad_norm": 0.07701804052472745,
"learning_rate": 2.1998731858531777e-06,
"loss": 1.1655,
"mean_token_accuracy": 0.722652930021286,
"num_tokens": 773212126.0,
"step": 8390
},
{
"entropy": 1.166796875,
"epoch": 1.1480114801148011,
"grad_norm": 0.07677281853337992,
"learning_rate": 2.196350570663661e-06,
"loss": 1.172,
"mean_token_accuracy": 0.7225108087062836,
"num_tokens": 774140401.0,
"step": 8400
},
{
"entropy": 1.137890625,
"epoch": 1.1493781604482711,
"grad_norm": 0.06482175305380408,
"learning_rate": 2.192827955474144e-06,
"loss": 1.1299,
"mean_token_accuracy": 0.7296415746212006,
"num_tokens": 775040095.0,
"step": 8410
},
{
"entropy": 1.10546875,
"epoch": 1.150744840781741,
"grad_norm": 0.07044084529803732,
"learning_rate": 2.1893053402846275e-06,
"loss": 1.1091,
"mean_token_accuracy": 0.7352366030216217,
"num_tokens": 775963582.0,
"step": 8420
},
{
"entropy": 1.20390625,
"epoch": 1.152111521115211,
"grad_norm": 0.07288978105545031,
"learning_rate": 2.1857827250951106e-06,
"loss": 1.2035,
"mean_token_accuracy": 0.7188331723213196,
"num_tokens": 776913274.0,
"step": 8430
},
{
"entropy": 1.16875,
"epoch": 1.153478201448681,
"grad_norm": 0.07877013925839645,
"learning_rate": 2.182260109905594e-06,
"loss": 1.1697,
"mean_token_accuracy": 0.7206570148468018,
"num_tokens": 777837642.0,
"step": 8440
},
{
"entropy": 1.11015625,
"epoch": 1.154844881782151,
"grad_norm": 0.07753214450738269,
"learning_rate": 2.1787374947160777e-06,
"loss": 1.1081,
"mean_token_accuracy": 0.7332981526851654,
"num_tokens": 778735236.0,
"step": 8450
},
{
"entropy": 1.11171875,
"epoch": 1.156211562115621,
"grad_norm": 0.06821514199351059,
"learning_rate": 2.175214879526561e-06,
"loss": 1.1123,
"mean_token_accuracy": 0.7327512741088867,
"num_tokens": 779695113.0,
"step": 8460
},
{
"entropy": 1.16171875,
"epoch": 1.1575782424490912,
"grad_norm": 0.07138868308952065,
"learning_rate": 2.171692264337044e-06,
"loss": 1.1515,
"mean_token_accuracy": 0.7264484286308288,
"num_tokens": 780598891.0,
"step": 8470
},
{
"entropy": 1.13984375,
"epoch": 1.1589449227825612,
"grad_norm": 0.12301799023317873,
"learning_rate": 2.1681696491475275e-06,
"loss": 1.1272,
"mean_token_accuracy": 0.7303335785865783,
"num_tokens": 781494278.0,
"step": 8480
},
{
"entropy": 1.1484375,
"epoch": 1.1603116031160312,
"grad_norm": 0.06923083753794987,
"learning_rate": 2.1646470339580106e-06,
"loss": 1.1391,
"mean_token_accuracy": 0.7287551820278168,
"num_tokens": 782471652.0,
"step": 8490
},
{
"entropy": 1.1296875,
"epoch": 1.1616782834495012,
"grad_norm": 0.07303437337055124,
"learning_rate": 2.1611244187684937e-06,
"loss": 1.1336,
"mean_token_accuracy": 0.7310151040554047,
"num_tokens": 783377485.0,
"step": 8500
},
{
"entropy": 1.13984375,
"epoch": 1.1630449637829712,
"grad_norm": 0.06950310619237385,
"learning_rate": 2.1576018035789773e-06,
"loss": 1.1401,
"mean_token_accuracy": 0.7295718014240264,
"num_tokens": 784337833.0,
"step": 8510
},
{
"entropy": 1.103515625,
"epoch": 1.1644116441164412,
"grad_norm": 0.0767368103344973,
"learning_rate": 2.1540791883894604e-06,
"loss": 1.1005,
"mean_token_accuracy": 0.734194540977478,
"num_tokens": 785204537.0,
"step": 8520
},
{
"entropy": 1.1609375,
"epoch": 1.1657783244499111,
"grad_norm": 0.06853416838965382,
"learning_rate": 2.150556573199944e-06,
"loss": 1.1697,
"mean_token_accuracy": 0.724907112121582,
"num_tokens": 786110258.0,
"step": 8530
},
{
"entropy": 1.12578125,
"epoch": 1.1671450047833811,
"grad_norm": 0.07019108099163454,
"learning_rate": 2.147033958010427e-06,
"loss": 1.1229,
"mean_token_accuracy": 0.7310481786727905,
"num_tokens": 787025727.0,
"step": 8540
},
{
"entropy": 1.1640625,
"epoch": 1.168511685116851,
"grad_norm": 0.07840793054786563,
"learning_rate": 2.14351134282091e-06,
"loss": 1.159,
"mean_token_accuracy": 0.7262536764144898,
"num_tokens": 787966153.0,
"step": 8550
},
{
"entropy": 1.12265625,
"epoch": 1.169878365450321,
"grad_norm": 0.07290579987469484,
"learning_rate": 2.1399887276313937e-06,
"loss": 1.1025,
"mean_token_accuracy": 0.7351586222648621,
"num_tokens": 788911753.0,
"step": 8560
},
{
"entropy": 1.17265625,
"epoch": 1.1712450457837913,
"grad_norm": 0.07158435860379113,
"learning_rate": 2.136466112441877e-06,
"loss": 1.1714,
"mean_token_accuracy": 0.7222486734390259,
"num_tokens": 789851402.0,
"step": 8570
},
{
"entropy": 1.12734375,
"epoch": 1.1726117261172613,
"grad_norm": 0.07288008654530807,
"learning_rate": 2.1329434972523604e-06,
"loss": 1.1277,
"mean_token_accuracy": 0.7305023550987244,
"num_tokens": 790766294.0,
"step": 8580
},
{
"entropy": 1.136328125,
"epoch": 1.1739784064507313,
"grad_norm": 0.07564368553817735,
"learning_rate": 2.1294208820628435e-06,
"loss": 1.126,
"mean_token_accuracy": 0.7301140666007996,
"num_tokens": 791663847.0,
"step": 8590
},
{
"entropy": 1.190625,
"epoch": 1.1753450867842012,
"grad_norm": 0.07545601225204356,
"learning_rate": 2.125898266873327e-06,
"loss": 1.1911,
"mean_token_accuracy": 0.7169671177864074,
"num_tokens": 792644026.0,
"step": 8600
},
{
"entropy": 1.18359375,
"epoch": 1.1767117671176712,
"grad_norm": 0.07092306924051037,
"learning_rate": 2.12237565168381e-06,
"loss": 1.1742,
"mean_token_accuracy": 0.7207931280136108,
"num_tokens": 793573004.0,
"step": 8610
},
{
"entropy": 1.165625,
"epoch": 1.1780784474511412,
"grad_norm": 0.0706841444159185,
"learning_rate": 2.1188530364942938e-06,
"loss": 1.1692,
"mean_token_accuracy": 0.7220779120922088,
"num_tokens": 794498385.0,
"step": 8620
},
{
"entropy": 1.109375,
"epoch": 1.1794451277846112,
"grad_norm": 0.06570440800098687,
"learning_rate": 2.115330421304777e-06,
"loss": 1.1092,
"mean_token_accuracy": 0.7318012297153473,
"num_tokens": 795410342.0,
"step": 8630
},
{
"entropy": 1.1203125,
"epoch": 1.1808118081180812,
"grad_norm": 0.07259147924291136,
"learning_rate": 2.1118078061152604e-06,
"loss": 1.1382,
"mean_token_accuracy": 0.7286916494369506,
"num_tokens": 796377123.0,
"step": 8640
},
{
"entropy": 1.1109375,
"epoch": 1.1821784884515512,
"grad_norm": 0.07101254355211642,
"learning_rate": 2.1082851909257435e-06,
"loss": 1.117,
"mean_token_accuracy": 0.7316508412361145,
"num_tokens": 797310568.0,
"step": 8650
},
{
"entropy": 1.140234375,
"epoch": 1.1835451687850211,
"grad_norm": 0.06731776196499008,
"learning_rate": 2.1047625757362267e-06,
"loss": 1.1448,
"mean_token_accuracy": 0.727871423959732,
"num_tokens": 798259517.0,
"step": 8660
},
{
"entropy": 1.1265625,
"epoch": 1.1849118491184911,
"grad_norm": 0.07971773137316539,
"learning_rate": 2.1012399605467102e-06,
"loss": 1.1396,
"mean_token_accuracy": 0.7273545384407043,
"num_tokens": 799148339.0,
"step": 8670
},
{
"entropy": 1.17109375,
"epoch": 1.186278529451961,
"grad_norm": 0.06983664771631375,
"learning_rate": 2.0977173453571933e-06,
"loss": 1.1789,
"mean_token_accuracy": 0.7210070610046386,
"num_tokens": 800030657.0,
"step": 8680
},
{
"entropy": 1.19453125,
"epoch": 1.187645209785431,
"grad_norm": 0.08209809603115345,
"learning_rate": 2.0941947301676765e-06,
"loss": 1.2031,
"mean_token_accuracy": 0.7169331848621369,
"num_tokens": 800903352.0,
"step": 8690
},
{
"entropy": 1.11875,
"epoch": 1.189011890118901,
"grad_norm": 0.07010020908030337,
"learning_rate": 2.09067211497816e-06,
"loss": 1.112,
"mean_token_accuracy": 0.7327292680740356,
"num_tokens": 801839997.0,
"step": 8700
},
{
"entropy": 1.148828125,
"epoch": 1.1903785704523713,
"grad_norm": 0.08380536531093119,
"learning_rate": 2.087149499788643e-06,
"loss": 1.1454,
"mean_token_accuracy": 0.7258519172668457,
"num_tokens": 802754802.0,
"step": 8710
},
{
"entropy": 1.13828125,
"epoch": 1.1917452507858413,
"grad_norm": 0.07932575023167572,
"learning_rate": 2.0836268845991263e-06,
"loss": 1.1376,
"mean_token_accuracy": 0.7273097932338715,
"num_tokens": 803675305.0,
"step": 8720
},
{
"entropy": 1.1515625,
"epoch": 1.1931119311193112,
"grad_norm": 0.07993407470940914,
"learning_rate": 2.08010426940961e-06,
"loss": 1.1653,
"mean_token_accuracy": 0.7255385935306549,
"num_tokens": 804626080.0,
"step": 8730
},
{
"entropy": 1.1703125,
"epoch": 1.1944786114527812,
"grad_norm": 0.07467356089444765,
"learning_rate": 2.076581654220093e-06,
"loss": 1.1837,
"mean_token_accuracy": 0.7207926034927368,
"num_tokens": 805552114.0,
"step": 8740
},
{
"entropy": 1.115625,
"epoch": 1.1958452917862512,
"grad_norm": 0.0776834357513849,
"learning_rate": 2.0730590390305765e-06,
"loss": 1.1129,
"mean_token_accuracy": 0.7323619186878204,
"num_tokens": 806462200.0,
"step": 8750
},
{
"entropy": 1.15390625,
"epoch": 1.1972119721197212,
"grad_norm": 0.07602253532829839,
"learning_rate": 2.0695364238410596e-06,
"loss": 1.1357,
"mean_token_accuracy": 0.7275901913642884,
"num_tokens": 807437504.0,
"step": 8760
},
{
"entropy": 1.1421875,
"epoch": 1.1985786524531912,
"grad_norm": 0.06595220683371972,
"learning_rate": 2.066013808651543e-06,
"loss": 1.1474,
"mean_token_accuracy": 0.7252663850784302,
"num_tokens": 808356776.0,
"step": 8770
},
{
"entropy": 1.1875,
"epoch": 1.1999453327866612,
"grad_norm": 0.08084786077398104,
"learning_rate": 2.0624911934620263e-06,
"loss": 1.1885,
"mean_token_accuracy": 0.7198069036006928,
"num_tokens": 809235508.0,
"step": 8780
},
{
"entropy": 1.1734375,
"epoch": 1.2013120131201311,
"grad_norm": 0.07459615232699228,
"learning_rate": 2.05896857827251e-06,
"loss": 1.1757,
"mean_token_accuracy": 0.7199897289276123,
"num_tokens": 810205113.0,
"step": 8790
},
{
"entropy": 1.121484375,
"epoch": 1.2026786934536011,
"grad_norm": 0.07438645489050084,
"learning_rate": 2.055445963082993e-06,
"loss": 1.1198,
"mean_token_accuracy": 0.7345019221305847,
"num_tokens": 811113331.0,
"step": 8800
},
{
"entropy": 1.121875,
"epoch": 1.2040453737870713,
"grad_norm": 0.07251073836182893,
"learning_rate": 2.0519233478934765e-06,
"loss": 1.1333,
"mean_token_accuracy": 0.7288311719894409,
"num_tokens": 812063842.0,
"step": 8810
},
{
"entropy": 1.150390625,
"epoch": 1.2054120541205413,
"grad_norm": 0.07447891227059372,
"learning_rate": 2.0484007327039596e-06,
"loss": 1.1536,
"mean_token_accuracy": 0.7244961082935333,
"num_tokens": 812974009.0,
"step": 8820
},
{
"entropy": 1.158984375,
"epoch": 1.2067787344540113,
"grad_norm": 0.07465814914682196,
"learning_rate": 2.0448781175144427e-06,
"loss": 1.1477,
"mean_token_accuracy": 0.7266793489456177,
"num_tokens": 813886829.0,
"step": 8830
},
{
"entropy": 1.1390625,
"epoch": 1.2081454147874813,
"grad_norm": 0.07613529924984616,
"learning_rate": 2.0413555023249263e-06,
"loss": 1.1386,
"mean_token_accuracy": 0.7287739276885986,
"num_tokens": 814818796.0,
"step": 8840
},
{
"entropy": 1.14140625,
"epoch": 1.2095120951209513,
"grad_norm": 0.07838321627093393,
"learning_rate": 2.0378328871354094e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.7273074924945832,
"num_tokens": 815777038.0,
"step": 8850
},
{
"entropy": 1.1390625,
"epoch": 1.2108787754544212,
"grad_norm": 0.0699839715408524,
"learning_rate": 2.034310271945893e-06,
"loss": 1.145,
"mean_token_accuracy": 0.728177833557129,
"num_tokens": 816742274.0,
"step": 8860
},
{
"entropy": 1.140625,
"epoch": 1.2122454557878912,
"grad_norm": 0.07358690034432123,
"learning_rate": 2.030787656756376e-06,
"loss": 1.1363,
"mean_token_accuracy": 0.7278168439865113,
"num_tokens": 817650313.0,
"step": 8870
},
{
"entropy": 1.13359375,
"epoch": 1.2136121361213612,
"grad_norm": 0.0711824737455651,
"learning_rate": 2.027265041566859e-06,
"loss": 1.1483,
"mean_token_accuracy": 0.7268934965133667,
"num_tokens": 818564569.0,
"step": 8880
},
{
"entropy": 1.13046875,
"epoch": 1.2149788164548312,
"grad_norm": 0.07781172943546648,
"learning_rate": 2.0237424263773427e-06,
"loss": 1.1219,
"mean_token_accuracy": 0.7314777791500091,
"num_tokens": 819452714.0,
"step": 8890
},
{
"entropy": 1.10390625,
"epoch": 1.2163454967883012,
"grad_norm": 0.06708819232651604,
"learning_rate": 2.020219811187826e-06,
"loss": 1.1067,
"mean_token_accuracy": 0.7337666392326355,
"num_tokens": 820395120.0,
"step": 8900
},
{
"entropy": 1.15,
"epoch": 1.2177121771217712,
"grad_norm": 0.06823548206382402,
"learning_rate": 2.0166971959983094e-06,
"loss": 1.1653,
"mean_token_accuracy": 0.722117829322815,
"num_tokens": 821331637.0,
"step": 8910
},
{
"entropy": 1.1046875,
"epoch": 1.2190788574552411,
"grad_norm": 0.07165462101633481,
"learning_rate": 2.0131745808087925e-06,
"loss": 1.1129,
"mean_token_accuracy": 0.7339481711387634,
"num_tokens": 822262363.0,
"step": 8920
},
{
"entropy": 1.119921875,
"epoch": 1.2204455377887111,
"grad_norm": 0.07718557053472008,
"learning_rate": 2.009651965619276e-06,
"loss": 1.1196,
"mean_token_accuracy": 0.7312289237976074,
"num_tokens": 823185730.0,
"step": 8930
},
{
"entropy": 1.16171875,
"epoch": 1.221812218122181,
"grad_norm": 0.07496350150180707,
"learning_rate": 2.0061293504297592e-06,
"loss": 1.1751,
"mean_token_accuracy": 0.7216550290584565,
"num_tokens": 824130635.0,
"step": 8940
},
{
"entropy": 1.096484375,
"epoch": 1.2231788984556513,
"grad_norm": 0.07370331659270653,
"learning_rate": 2.0026067352402428e-06,
"loss": 1.0984,
"mean_token_accuracy": 0.735461413860321,
"num_tokens": 825059981.0,
"step": 8950
},
{
"entropy": 1.156640625,
"epoch": 1.2245455787891213,
"grad_norm": 0.07538360227399012,
"learning_rate": 1.999084120050726e-06,
"loss": 1.1613,
"mean_token_accuracy": 0.7235375583171845,
"num_tokens": 825984443.0,
"step": 8960
},
{
"entropy": 1.110546875,
"epoch": 1.2259122591225913,
"grad_norm": 0.0839890159718588,
"learning_rate": 1.9955615048612094e-06,
"loss": 1.1092,
"mean_token_accuracy": 0.7340769290924072,
"num_tokens": 826900181.0,
"step": 8970
},
{
"entropy": 1.14296875,
"epoch": 1.2272789394560613,
"grad_norm": 0.13910046595278502,
"learning_rate": 1.9920388896716926e-06,
"loss": 1.149,
"mean_token_accuracy": 0.7263824820518494,
"num_tokens": 827834339.0,
"step": 8980
},
{
"entropy": 1.15,
"epoch": 1.2286456197895312,
"grad_norm": 0.07032035206647684,
"learning_rate": 1.9885162744821757e-06,
"loss": 1.1628,
"mean_token_accuracy": 0.7262707948684692,
"num_tokens": 828755267.0,
"step": 8990
},
{
"entropy": 1.13515625,
"epoch": 1.2300123001230012,
"grad_norm": 0.07735446655338968,
"learning_rate": 1.9849936592926592e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.7272780478000641,
"num_tokens": 829615451.0,
"step": 9000
},
{
"entropy": 1.12109375,
"epoch": 1.2313789804564712,
"grad_norm": 0.07394324379560933,
"learning_rate": 1.9814710441031424e-06,
"loss": 1.1148,
"mean_token_accuracy": 0.7318297445774078,
"num_tokens": 830524527.0,
"step": 9010
},
{
"entropy": 1.1109375,
"epoch": 1.2327456607899412,
"grad_norm": 0.07760455757542029,
"learning_rate": 1.9779484289136255e-06,
"loss": 1.0944,
"mean_token_accuracy": 0.7374783456325531,
"num_tokens": 831423667.0,
"step": 9020
},
{
"entropy": 1.159375,
"epoch": 1.2341123411234112,
"grad_norm": 0.07460604134318964,
"learning_rate": 1.974425813724109e-06,
"loss": 1.162,
"mean_token_accuracy": 0.7236018359661103,
"num_tokens": 832352867.0,
"step": 9030
},
{
"entropy": 1.125,
"epoch": 1.2354790214568812,
"grad_norm": 0.0702599541689073,
"learning_rate": 1.970903198534592e-06,
"loss": 1.1308,
"mean_token_accuracy": 0.7292075872421264,
"num_tokens": 833293798.0,
"step": 9040
},
{
"entropy": 1.15625,
"epoch": 1.2368457017903514,
"grad_norm": 0.08094064536661051,
"learning_rate": 1.9673805833450753e-06,
"loss": 1.1484,
"mean_token_accuracy": 0.7244746506214141,
"num_tokens": 834230235.0,
"step": 9050
},
{
"entropy": 1.1671875,
"epoch": 1.2382123821238213,
"grad_norm": 0.0666012796672545,
"learning_rate": 1.963857968155559e-06,
"loss": 1.1608,
"mean_token_accuracy": 0.7236746191978455,
"num_tokens": 835171316.0,
"step": 9060
},
{
"entropy": 1.11796875,
"epoch": 1.2395790624572913,
"grad_norm": 0.08333767084378838,
"learning_rate": 1.960335352966042e-06,
"loss": 1.1147,
"mean_token_accuracy": 0.7337950646877289,
"num_tokens": 836094206.0,
"step": 9070
},
{
"entropy": 1.16640625,
"epoch": 1.2409457427907613,
"grad_norm": 0.08268070742776759,
"learning_rate": 1.9568127377765255e-06,
"loss": 1.166,
"mean_token_accuracy": 0.7216145694255829,
"num_tokens": 837040128.0,
"step": 9080
},
{
"entropy": 1.09921875,
"epoch": 1.2423124231242313,
"grad_norm": 0.06859934027489586,
"learning_rate": 1.9532901225870086e-06,
"loss": 1.0994,
"mean_token_accuracy": 0.7348327755928039,
"num_tokens": 838015487.0,
"step": 9090
},
{
"entropy": 1.13671875,
"epoch": 1.2436791034577013,
"grad_norm": 0.08071572244851227,
"learning_rate": 1.949767507397492e-06,
"loss": 1.1319,
"mean_token_accuracy": 0.7284305691719055,
"num_tokens": 838908651.0,
"step": 9100
},
{
"entropy": 1.169921875,
"epoch": 1.2450457837911713,
"grad_norm": 0.07348118444230126,
"learning_rate": 1.9462448922079753e-06,
"loss": 1.1734,
"mean_token_accuracy": 0.7217342257499695,
"num_tokens": 839819407.0,
"step": 9110
},
{
"entropy": 1.14375,
"epoch": 1.2464124641246412,
"grad_norm": 0.13604479899468314,
"learning_rate": 1.942722277018459e-06,
"loss": 1.1525,
"mean_token_accuracy": 0.7272558152675629,
"num_tokens": 840716632.0,
"step": 9120
},
{
"entropy": 1.18046875,
"epoch": 1.2477791444581112,
"grad_norm": 0.06977526489195568,
"learning_rate": 1.939199661828942e-06,
"loss": 1.188,
"mean_token_accuracy": 0.7175242483615876,
"num_tokens": 841628747.0,
"step": 9130
},
{
"entropy": 1.13203125,
"epoch": 1.2491458247915812,
"grad_norm": 0.07584609332758994,
"learning_rate": 1.9356770466394255e-06,
"loss": 1.1385,
"mean_token_accuracy": 0.729504132270813,
"num_tokens": 842477808.0,
"step": 9140
},
{
"entropy": 1.115625,
"epoch": 1.2505125051250512,
"grad_norm": 0.0702106778659683,
"learning_rate": 1.9321544314499086e-06,
"loss": 1.1065,
"mean_token_accuracy": 0.734652304649353,
"num_tokens": 843389427.0,
"step": 9150
},
{
"entropy": 1.134375,
"epoch": 1.2518791854585212,
"grad_norm": 0.08075416205490918,
"learning_rate": 1.9286318162603917e-06,
"loss": 1.1371,
"mean_token_accuracy": 0.7268370747566223,
"num_tokens": 844326133.0,
"step": 9160
},
{
"entropy": 1.109375,
"epoch": 1.2532458657919912,
"grad_norm": 0.06995120658896171,
"learning_rate": 1.9251092010708753e-06,
"loss": 1.1148,
"mean_token_accuracy": 0.7341335117816925,
"num_tokens": 845242173.0,
"step": 9170
},
{
"entropy": 1.15703125,
"epoch": 1.2546125461254611,
"grad_norm": 0.0631341449337719,
"learning_rate": 1.9215865858813584e-06,
"loss": 1.1684,
"mean_token_accuracy": 0.7230552017688752,
"num_tokens": 846159471.0,
"step": 9180
},
{
"entropy": 1.16953125,
"epoch": 1.2559792264589311,
"grad_norm": 0.07132106109251457,
"learning_rate": 1.918063970691842e-06,
"loss": 1.1776,
"mean_token_accuracy": 0.7198686003684998,
"num_tokens": 847086459.0,
"step": 9190
},
{
"entropy": 1.143359375,
"epoch": 1.2573459067924013,
"grad_norm": 0.07867357568069326,
"learning_rate": 1.914541355502325e-06,
"loss": 1.132,
"mean_token_accuracy": 0.7273231208324432,
"num_tokens": 847965115.0,
"step": 9200
},
{
"entropy": 1.13046875,
"epoch": 1.2587125871258713,
"grad_norm": 0.07222561462939713,
"learning_rate": 1.9110187403128082e-06,
"loss": 1.1297,
"mean_token_accuracy": 0.7287319421768188,
"num_tokens": 848874699.0,
"step": 9210
},
{
"entropy": 1.142578125,
"epoch": 1.2600792674593413,
"grad_norm": 0.0699376909977332,
"learning_rate": 1.9074961251232918e-06,
"loss": 1.1482,
"mean_token_accuracy": 0.7261109411716461,
"num_tokens": 849836872.0,
"step": 9220
},
{
"entropy": 1.1234375,
"epoch": 1.2614459477928113,
"grad_norm": 0.07839078864445471,
"learning_rate": 1.903973509933775e-06,
"loss": 1.1197,
"mean_token_accuracy": 0.7293895900249481,
"num_tokens": 850740853.0,
"step": 9230
},
{
"entropy": 1.175,
"epoch": 1.2628126281262813,
"grad_norm": 0.07315644950478395,
"learning_rate": 1.9004508947442582e-06,
"loss": 1.1659,
"mean_token_accuracy": 0.7224130213260651,
"num_tokens": 851633495.0,
"step": 9240
},
{
"entropy": 1.104296875,
"epoch": 1.2641793084597512,
"grad_norm": 0.0796782315373933,
"learning_rate": 1.8969282795547418e-06,
"loss": 1.109,
"mean_token_accuracy": 0.7337846159934998,
"num_tokens": 852532177.0,
"step": 9250
},
{
"entropy": 1.1453125,
"epoch": 1.2655459887932212,
"grad_norm": 0.073445018992185,
"learning_rate": 1.8934056643652249e-06,
"loss": 1.1427,
"mean_token_accuracy": 0.7261588096618652,
"num_tokens": 853442859.0,
"step": 9260
},
{
"entropy": 1.1296875,
"epoch": 1.2669126691266912,
"grad_norm": 0.07342574051017156,
"learning_rate": 1.889883049175708e-06,
"loss": 1.1277,
"mean_token_accuracy": 0.7296112954616547,
"num_tokens": 854344107.0,
"step": 9270
},
{
"entropy": 1.08046875,
"epoch": 1.2682793494601612,
"grad_norm": 0.07278892609186548,
"learning_rate": 1.8863604339861916e-06,
"loss": 1.0801,
"mean_token_accuracy": 0.7384391844272613,
"num_tokens": 855262838.0,
"step": 9280
},
{
"entropy": 1.16171875,
"epoch": 1.2696460297936314,
"grad_norm": 0.07615503214081153,
"learning_rate": 1.8828378187966747e-06,
"loss": 1.17,
"mean_token_accuracy": 0.7233425080776215,
"num_tokens": 856200075.0,
"step": 9290
},
{
"entropy": 1.12578125,
"epoch": 1.2710127101271014,
"grad_norm": 0.07410217268984316,
"learning_rate": 1.8793152036071582e-06,
"loss": 1.1329,
"mean_token_accuracy": 0.7287874817848206,
"num_tokens": 857113863.0,
"step": 9300
},
{
"entropy": 1.1078125,
"epoch": 1.2723793904605714,
"grad_norm": 0.07079163147628535,
"learning_rate": 1.8757925884176414e-06,
"loss": 1.1087,
"mean_token_accuracy": 0.7323549926280976,
"num_tokens": 858051807.0,
"step": 9310
},
{
"entropy": 1.1546875,
"epoch": 1.2737460707940413,
"grad_norm": 0.07353078977278218,
"learning_rate": 1.8722699732281247e-06,
"loss": 1.1575,
"mean_token_accuracy": 0.7236910462379456,
"num_tokens": 859041307.0,
"step": 9320
},
{
"entropy": 1.178125,
"epoch": 1.2751127511275113,
"grad_norm": 0.07541457434188527,
"learning_rate": 1.868747358038608e-06,
"loss": 1.1735,
"mean_token_accuracy": 0.7210469007492065,
"num_tokens": 859948415.0,
"step": 9330
},
{
"entropy": 1.10625,
"epoch": 1.2764794314609813,
"grad_norm": 0.07211916392287372,
"learning_rate": 1.8652247428490914e-06,
"loss": 1.103,
"mean_token_accuracy": 0.735126155614853,
"num_tokens": 860865275.0,
"step": 9340
},
{
"entropy": 1.1625,
"epoch": 1.2778461117944513,
"grad_norm": 0.07617257953563018,
"learning_rate": 1.8617021276595745e-06,
"loss": 1.1736,
"mean_token_accuracy": 0.7231076717376709,
"num_tokens": 861771377.0,
"step": 9350
},
{
"entropy": 1.1453125,
"epoch": 1.2792127921279213,
"grad_norm": 0.07874224805315218,
"learning_rate": 1.858179512470058e-06,
"loss": 1.1506,
"mean_token_accuracy": 0.727257376909256,
"num_tokens": 862681652.0,
"step": 9360
},
{
"entropy": 1.11953125,
"epoch": 1.2805794724613913,
"grad_norm": 0.0681819842280324,
"learning_rate": 1.8546568972805412e-06,
"loss": 1.1263,
"mean_token_accuracy": 0.7309780240058898,
"num_tokens": 863660677.0,
"step": 9370
},
{
"entropy": 1.1046875,
"epoch": 1.2819461527948612,
"grad_norm": 0.07478706160695187,
"learning_rate": 1.8511342820910245e-06,
"loss": 1.1024,
"mean_token_accuracy": 0.7339930713176728,
"num_tokens": 864575910.0,
"step": 9380
},
{
"entropy": 1.09609375,
"epoch": 1.2833128331283312,
"grad_norm": 0.072517557563058,
"learning_rate": 1.8476116669015078e-06,
"loss": 1.0975,
"mean_token_accuracy": 0.7368973612785339,
"num_tokens": 865515416.0,
"step": 9390
},
{
"entropy": 1.115234375,
"epoch": 1.2846795134618012,
"grad_norm": 0.06633148851254346,
"learning_rate": 1.8440890517119912e-06,
"loss": 1.1122,
"mean_token_accuracy": 0.733020794391632,
"num_tokens": 866447102.0,
"step": 9400
},
{
"entropy": 1.1296875,
"epoch": 1.2860461937952712,
"grad_norm": 0.07874359668363462,
"learning_rate": 1.8405664365224743e-06,
"loss": 1.1257,
"mean_token_accuracy": 0.7299791276454926,
"num_tokens": 867334361.0,
"step": 9410
},
{
"entropy": 1.16796875,
"epoch": 1.2874128741287412,
"grad_norm": 0.07726840674265666,
"learning_rate": 1.8370438213329578e-06,
"loss": 1.1859,
"mean_token_accuracy": 0.719907408952713,
"num_tokens": 868306198.0,
"step": 9420
},
{
"entropy": 1.15546875,
"epoch": 1.2887795544622112,
"grad_norm": 0.07687884999608567,
"learning_rate": 1.833521206143441e-06,
"loss": 1.1477,
"mean_token_accuracy": 0.7269446551799774,
"num_tokens": 869231963.0,
"step": 9430
},
{
"entropy": 1.19375,
"epoch": 1.2901462347956814,
"grad_norm": 0.07416649183473273,
"learning_rate": 1.8299985909539245e-06,
"loss": 1.1911,
"mean_token_accuracy": 0.7178772866725922,
"num_tokens": 870118000.0,
"step": 9440
},
{
"entropy": 1.17734375,
"epoch": 1.2915129151291513,
"grad_norm": 0.07690607413587268,
"learning_rate": 1.8264759757644076e-06,
"loss": 1.1744,
"mean_token_accuracy": 0.7225610196590424,
"num_tokens": 871032920.0,
"step": 9450
},
{
"entropy": 1.1265625,
"epoch": 1.2928795954626213,
"grad_norm": 0.08896069330593936,
"learning_rate": 1.822953360574891e-06,
"loss": 1.1272,
"mean_token_accuracy": 0.7300606429576874,
"num_tokens": 871910994.0,
"step": 9460
},
{
"entropy": 1.1125,
"epoch": 1.2942462757960913,
"grad_norm": 0.06356215891001715,
"learning_rate": 1.8194307453853743e-06,
"loss": 1.107,
"mean_token_accuracy": 0.734264588356018,
"num_tokens": 872819239.0,
"step": 9470
},
{
"entropy": 1.17265625,
"epoch": 1.2956129561295613,
"grad_norm": 0.07834590224188576,
"learning_rate": 1.8159081301958576e-06,
"loss": 1.1813,
"mean_token_accuracy": 0.721282160282135,
"num_tokens": 873762298.0,
"step": 9480
},
{
"entropy": 1.13046875,
"epoch": 1.2969796364630313,
"grad_norm": 0.0657228632087804,
"learning_rate": 1.8123855150063408e-06,
"loss": 1.1282,
"mean_token_accuracy": 0.7300038278102875,
"num_tokens": 874698488.0,
"step": 9490
},
{
"entropy": 1.1625,
"epoch": 1.2983463167965013,
"grad_norm": 0.10473983622198,
"learning_rate": 1.8088628998168243e-06,
"loss": 1.1859,
"mean_token_accuracy": 0.7205550611019135,
"num_tokens": 875593896.0,
"step": 9500
},
{
"entropy": 1.10703125,
"epoch": 1.2997129971299712,
"grad_norm": 0.07860910258076403,
"learning_rate": 1.8053402846273074e-06,
"loss": 1.0959,
"mean_token_accuracy": 0.7358021855354309,
"num_tokens": 876502712.0,
"step": 9510
},
{
"entropy": 1.069921875,
"epoch": 1.3010796774634412,
"grad_norm": 0.07033955075002749,
"learning_rate": 1.8018176694377906e-06,
"loss": 1.0762,
"mean_token_accuracy": 0.7384052753448487,
"num_tokens": 877406242.0,
"step": 9520
},
{
"entropy": 1.116015625,
"epoch": 1.3024463577969114,
"grad_norm": 0.0714230796613332,
"learning_rate": 1.798295054248274e-06,
"loss": 1.1243,
"mean_token_accuracy": 0.7297037243843079,
"num_tokens": 878297818.0,
"step": 9530
},
{
"entropy": 1.1140625,
"epoch": 1.3038130381303814,
"grad_norm": 0.08020621464121885,
"learning_rate": 1.7947724390587572e-06,
"loss": 1.1073,
"mean_token_accuracy": 0.7340352952480316,
"num_tokens": 879197968.0,
"step": 9540
},
{
"entropy": 1.1375,
"epoch": 1.3051797184638514,
"grad_norm": 0.0850141916978718,
"learning_rate": 1.7912498238692408e-06,
"loss": 1.1456,
"mean_token_accuracy": 0.7264250040054321,
"num_tokens": 880144165.0,
"step": 9550
},
{
"entropy": 1.1171875,
"epoch": 1.3065463987973214,
"grad_norm": 0.07846491055694123,
"learning_rate": 1.7877272086797239e-06,
"loss": 1.1326,
"mean_token_accuracy": 0.7301059126853943,
"num_tokens": 881067277.0,
"step": 9560
},
{
"entropy": 1.14765625,
"epoch": 1.3079130791307914,
"grad_norm": 0.07671809600940627,
"learning_rate": 1.7842045934902072e-06,
"loss": 1.1445,
"mean_token_accuracy": 0.7251434326171875,
"num_tokens": 882005499.0,
"step": 9570
},
{
"entropy": 1.137109375,
"epoch": 1.3092797594642613,
"grad_norm": 0.15239688632717907,
"learning_rate": 1.7806819783006908e-06,
"loss": 1.1271,
"mean_token_accuracy": 0.730139684677124,
"num_tokens": 882921824.0,
"step": 9580
},
{
"entropy": 1.1109375,
"epoch": 1.3106464397977313,
"grad_norm": 0.07247736194069035,
"learning_rate": 1.777159363111174e-06,
"loss": 1.1221,
"mean_token_accuracy": 0.7323022365570069,
"num_tokens": 883846035.0,
"step": 9590
},
{
"entropy": 1.128125,
"epoch": 1.3120131201312013,
"grad_norm": 0.0692601883165614,
"learning_rate": 1.773636747921657e-06,
"loss": 1.1171,
"mean_token_accuracy": 0.7314056038856507,
"num_tokens": 884719258.0,
"step": 9600
},
{
"entropy": 1.12890625,
"epoch": 1.3133798004646713,
"grad_norm": 0.06989779442856107,
"learning_rate": 1.7701141327321406e-06,
"loss": 1.1182,
"mean_token_accuracy": 0.733076137304306,
"num_tokens": 885675336.0,
"step": 9610
},
{
"entropy": 1.148828125,
"epoch": 1.3147464807981413,
"grad_norm": 0.07126553948108497,
"learning_rate": 1.7665915175426237e-06,
"loss": 1.1495,
"mean_token_accuracy": 0.7275227248668671,
"num_tokens": 886645805.0,
"step": 9620
},
{
"entropy": 1.13671875,
"epoch": 1.3161131611316113,
"grad_norm": 0.07086840193047149,
"learning_rate": 1.763068902353107e-06,
"loss": 1.1439,
"mean_token_accuracy": 0.7260739624500274,
"num_tokens": 887541444.0,
"step": 9630
},
{
"entropy": 1.175,
"epoch": 1.3174798414650812,
"grad_norm": 0.08906250571814406,
"learning_rate": 1.7595462871635904e-06,
"loss": 1.1881,
"mean_token_accuracy": 0.7177520334720612,
"num_tokens": 888431199.0,
"step": 9640
},
{
"entropy": 1.089453125,
"epoch": 1.3188465217985512,
"grad_norm": 0.06994352555105594,
"learning_rate": 1.7560236719740737e-06,
"loss": 1.0878,
"mean_token_accuracy": 0.7356501638889312,
"num_tokens": 889364908.0,
"step": 9650
},
{
"entropy": 1.14296875,
"epoch": 1.3202132021320212,
"grad_norm": 0.07111222626095962,
"learning_rate": 1.752501056784557e-06,
"loss": 1.1331,
"mean_token_accuracy": 0.7323017120361328,
"num_tokens": 890313125.0,
"step": 9660
},
{
"entropy": 1.140625,
"epoch": 1.3215798824654912,
"grad_norm": 0.07067351682665791,
"learning_rate": 1.7489784415950404e-06,
"loss": 1.141,
"mean_token_accuracy": 0.7260723769664764,
"num_tokens": 891198196.0,
"step": 9670
},
{
"entropy": 1.1140625,
"epoch": 1.3229465627989614,
"grad_norm": 0.0743462265797407,
"learning_rate": 1.7454558264055235e-06,
"loss": 1.1061,
"mean_token_accuracy": 0.7342061817646026,
"num_tokens": 892094044.0,
"step": 9680
},
{
"entropy": 1.12734375,
"epoch": 1.3243132431324314,
"grad_norm": 0.0728162953557407,
"learning_rate": 1.741933211216007e-06,
"loss": 1.1322,
"mean_token_accuracy": 0.7305579841136932,
"num_tokens": 892984168.0,
"step": 9690
},
{
"entropy": 1.15078125,
"epoch": 1.3256799234659014,
"grad_norm": 0.07876518373335618,
"learning_rate": 1.7384105960264902e-06,
"loss": 1.1494,
"mean_token_accuracy": 0.7274425148963928,
"num_tokens": 893912834.0,
"step": 9700
},
{
"entropy": 1.18125,
"epoch": 1.3270466037993713,
"grad_norm": 0.07468219599973004,
"learning_rate": 1.7348879808369735e-06,
"loss": 1.1763,
"mean_token_accuracy": 0.7181297361850738,
"num_tokens": 894862912.0,
"step": 9710
},
{
"entropy": 1.10390625,
"epoch": 1.3284132841328413,
"grad_norm": 0.07983612643366692,
"learning_rate": 1.7313653656474568e-06,
"loss": 1.1148,
"mean_token_accuracy": 0.7322055220603942,
"num_tokens": 895810211.0,
"step": 9720
},
{
"entropy": 1.141015625,
"epoch": 1.3297799644663113,
"grad_norm": 0.08322259502422936,
"learning_rate": 1.7278427504579402e-06,
"loss": 1.1511,
"mean_token_accuracy": 0.7258865118026734,
"num_tokens": 896767345.0,
"step": 9730
},
{
"entropy": 1.1625,
"epoch": 1.3311466447997813,
"grad_norm": 0.07379960973959934,
"learning_rate": 1.7243201352684233e-06,
"loss": 1.1817,
"mean_token_accuracy": 0.7205169260501861,
"num_tokens": 897730037.0,
"step": 9740
},
{
"entropy": 1.146875,
"epoch": 1.3325133251332513,
"grad_norm": 0.07870153386066578,
"learning_rate": 1.7207975200789068e-06,
"loss": 1.1509,
"mean_token_accuracy": 0.7253862857818604,
"num_tokens": 898654053.0,
"step": 9750
},
{
"entropy": 1.117578125,
"epoch": 1.3338800054667213,
"grad_norm": 0.07999621552418364,
"learning_rate": 1.71727490488939e-06,
"loss": 1.1131,
"mean_token_accuracy": 0.7319980442523957,
"num_tokens": 899526007.0,
"step": 9760
},
{
"entropy": 1.128125,
"epoch": 1.3352466858001915,
"grad_norm": 0.07822832606546781,
"learning_rate": 1.7137522896998735e-06,
"loss": 1.1488,
"mean_token_accuracy": 0.7261813938617706,
"num_tokens": 900457138.0,
"step": 9770
},
{
"entropy": 1.106640625,
"epoch": 1.3366133661336614,
"grad_norm": 0.07454192843167824,
"learning_rate": 1.7102296745103566e-06,
"loss": 1.1093,
"mean_token_accuracy": 0.7348284780979156,
"num_tokens": 901342348.0,
"step": 9780
},
{
"entropy": 1.1578125,
"epoch": 1.3379800464671314,
"grad_norm": 0.07540675576989608,
"learning_rate": 1.7067070593208398e-06,
"loss": 1.1595,
"mean_token_accuracy": 0.7248075664043426,
"num_tokens": 902289247.0,
"step": 9790
},
{
"entropy": 1.13359375,
"epoch": 1.3393467268006014,
"grad_norm": 0.07593957919235479,
"learning_rate": 1.7031844441313233e-06,
"loss": 1.1348,
"mean_token_accuracy": 0.7294245481491088,
"num_tokens": 903216964.0,
"step": 9800
},
{
"entropy": 1.12421875,
"epoch": 1.3407134071340714,
"grad_norm": 0.07683177616986779,
"learning_rate": 1.6996618289418064e-06,
"loss": 1.1332,
"mean_token_accuracy": 0.730054897069931,
"num_tokens": 904149583.0,
"step": 9810
},
{
"entropy": 1.14375,
"epoch": 1.3420800874675414,
"grad_norm": 0.07896732975258833,
"learning_rate": 1.6961392137522898e-06,
"loss": 1.1478,
"mean_token_accuracy": 0.7252162575721741,
"num_tokens": 905019600.0,
"step": 9820
},
{
"entropy": 1.105078125,
"epoch": 1.3434467678010114,
"grad_norm": 0.08374792452754906,
"learning_rate": 1.6926165985627733e-06,
"loss": 1.0911,
"mean_token_accuracy": 0.7372736990451813,
"num_tokens": 905895865.0,
"step": 9830
},
{
"entropy": 1.15078125,
"epoch": 1.3448134481344813,
"grad_norm": 0.07184841565143066,
"learning_rate": 1.6890939833732564e-06,
"loss": 1.1622,
"mean_token_accuracy": 0.7226279973983765,
"num_tokens": 906847363.0,
"step": 9840
},
{
"entropy": 1.1609375,
"epoch": 1.3461801284679513,
"grad_norm": 0.07518717519460955,
"learning_rate": 1.6855713681837396e-06,
"loss": 1.1689,
"mean_token_accuracy": 0.7243802726268769,
"num_tokens": 907782159.0,
"step": 9850
},
{
"entropy": 1.113671875,
"epoch": 1.3475468088014213,
"grad_norm": 0.07084378590412212,
"learning_rate": 1.6820487529942231e-06,
"loss": 1.1025,
"mean_token_accuracy": 0.7349331319332123,
"num_tokens": 908694428.0,
"step": 9860
},
{
"entropy": 1.105078125,
"epoch": 1.3489134891348913,
"grad_norm": 0.07769286253411409,
"learning_rate": 1.6785261378047062e-06,
"loss": 1.0981,
"mean_token_accuracy": 0.7351276755332947,
"num_tokens": 909610885.0,
"step": 9870
},
{
"entropy": 1.121875,
"epoch": 1.3502801694683613,
"grad_norm": 0.07643970710027267,
"learning_rate": 1.6750035226151898e-06,
"loss": 1.1123,
"mean_token_accuracy": 0.7324855864048004,
"num_tokens": 910486074.0,
"step": 9880
},
{
"entropy": 1.125,
"epoch": 1.3516468498018313,
"grad_norm": 0.08362491968517799,
"learning_rate": 1.671480907425673e-06,
"loss": 1.122,
"mean_token_accuracy": 0.7315805435180665,
"num_tokens": 911424517.0,
"step": 9890
},
{
"entropy": 1.15390625,
"epoch": 1.3530135301353012,
"grad_norm": 0.07661291616709345,
"learning_rate": 1.6679582922361562e-06,
"loss": 1.1576,
"mean_token_accuracy": 0.7253268778324127,
"num_tokens": 912321096.0,
"step": 9900
},
{
"entropy": 1.1359375,
"epoch": 1.3543802104687712,
"grad_norm": 0.07112926740065234,
"learning_rate": 1.6644356770466396e-06,
"loss": 1.139,
"mean_token_accuracy": 0.7273467957973481,
"num_tokens": 913256636.0,
"step": 9910
},
{
"entropy": 1.13828125,
"epoch": 1.3557468908022414,
"grad_norm": 0.07343903076014971,
"learning_rate": 1.660913061857123e-06,
"loss": 1.1405,
"mean_token_accuracy": 0.728032648563385,
"num_tokens": 914191751.0,
"step": 9920
},
{
"entropy": 1.0859375,
"epoch": 1.3571135711357114,
"grad_norm": 0.07357381099837174,
"learning_rate": 1.657390446667606e-06,
"loss": 1.0723,
"mean_token_accuracy": 0.7418630719184875,
"num_tokens": 915113189.0,
"step": 9930
},
{
"entropy": 1.1125,
"epoch": 1.3584802514691814,
"grad_norm": 0.07386608580989049,
"learning_rate": 1.6538678314780896e-06,
"loss": 1.12,
"mean_token_accuracy": 0.7314802050590515,
"num_tokens": 916020500.0,
"step": 9940
},
{
"entropy": 1.20234375,
"epoch": 1.3598469318026514,
"grad_norm": 0.08057570085988527,
"learning_rate": 1.6503452162885727e-06,
"loss": 1.2058,
"mean_token_accuracy": 0.7151572525501251,
"num_tokens": 916915733.0,
"step": 9950
},
{
"entropy": 1.144921875,
"epoch": 1.3612136121361214,
"grad_norm": 0.07591252446545281,
"learning_rate": 1.646822601099056e-06,
"loss": 1.1446,
"mean_token_accuracy": 0.7271172940731049,
"num_tokens": 917832257.0,
"step": 9960
},
{
"entropy": 1.119140625,
"epoch": 1.3625802924695913,
"grad_norm": 0.0769564128293883,
"learning_rate": 1.6432999859095394e-06,
"loss": 1.14,
"mean_token_accuracy": 0.7286533236503601,
"num_tokens": 918751964.0,
"step": 9970
},
{
"entropy": 1.16328125,
"epoch": 1.3639469728030613,
"grad_norm": 0.07828746905628316,
"learning_rate": 1.6397773707200227e-06,
"loss": 1.1654,
"mean_token_accuracy": 0.7222230672836304,
"num_tokens": 919683718.0,
"step": 9980
},
{
"entropy": 1.11171875,
"epoch": 1.3653136531365313,
"grad_norm": 0.07490978722133786,
"learning_rate": 1.636254755530506e-06,
"loss": 1.1203,
"mean_token_accuracy": 0.7297917127609252,
"num_tokens": 920567825.0,
"step": 9990
},
{
"entropy": 1.051953125,
"epoch": 1.3666803334700013,
"grad_norm": 0.07119670873227979,
"learning_rate": 1.6327321403409894e-06,
"loss": 1.0537,
"mean_token_accuracy": 0.743243145942688,
"num_tokens": 921479711.0,
"step": 10000
},
{
"entropy": 1.13671875,
"epoch": 1.3680470138034715,
"grad_norm": 0.07661283229101566,
"learning_rate": 1.6292095251514725e-06,
"loss": 1.1462,
"mean_token_accuracy": 0.7263119399547577,
"num_tokens": 922388746.0,
"step": 10010
},
{
"entropy": 1.1484375,
"epoch": 1.3694136941369415,
"grad_norm": 0.07844591758724494,
"learning_rate": 1.625686909961956e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.725129234790802,
"num_tokens": 923329842.0,
"step": 10020
},
{
"entropy": 1.14453125,
"epoch": 1.3707803744704115,
"grad_norm": 0.08743903239360103,
"learning_rate": 1.6221642947724392e-06,
"loss": 1.152,
"mean_token_accuracy": 0.726294893026352,
"num_tokens": 924206127.0,
"step": 10030
},
{
"entropy": 1.16171875,
"epoch": 1.3721470548038814,
"grad_norm": 0.07424200175591068,
"learning_rate": 1.6186416795829223e-06,
"loss": 1.1529,
"mean_token_accuracy": 0.7258797705173492,
"num_tokens": 925150765.0,
"step": 10040
},
{
"entropy": 1.14140625,
"epoch": 1.3735137351373514,
"grad_norm": 0.07929362382063793,
"learning_rate": 1.6151190643934058e-06,
"loss": 1.1372,
"mean_token_accuracy": 0.7273037195205688,
"num_tokens": 926087915.0,
"step": 10050
},
{
"entropy": 1.121484375,
"epoch": 1.3748804154708214,
"grad_norm": 0.07214111957405102,
"learning_rate": 1.6115964492038892e-06,
"loss": 1.1173,
"mean_token_accuracy": 0.731385350227356,
"num_tokens": 926998850.0,
"step": 10060
},
{
"entropy": 1.14765625,
"epoch": 1.3762470958042914,
"grad_norm": 0.07176541283502223,
"learning_rate": 1.6080738340143723e-06,
"loss": 1.1485,
"mean_token_accuracy": 0.7279722213745117,
"num_tokens": 927939577.0,
"step": 10070
},
{
"entropy": 1.100390625,
"epoch": 1.3776137761377614,
"grad_norm": 0.07679479035193196,
"learning_rate": 1.6045512188248559e-06,
"loss": 1.1073,
"mean_token_accuracy": 0.7343885600566864,
"num_tokens": 928810365.0,
"step": 10080
},
{
"entropy": 1.1484375,
"epoch": 1.3789804564712314,
"grad_norm": 0.07344414353054897,
"learning_rate": 1.601028603635339e-06,
"loss": 1.1487,
"mean_token_accuracy": 0.7267192780971528,
"num_tokens": 929705162.0,
"step": 10090
},
{
"entropy": 1.140625,
"epoch": 1.3803471368047013,
"grad_norm": 0.07535871185935702,
"learning_rate": 1.5975059884458225e-06,
"loss": 1.1555,
"mean_token_accuracy": 0.7248738825321197,
"num_tokens": 930678360.0,
"step": 10100
},
{
"entropy": 1.123828125,
"epoch": 1.3817138171381713,
"grad_norm": 0.07715564086763045,
"learning_rate": 1.5939833732563056e-06,
"loss": 1.1256,
"mean_token_accuracy": 0.7298265933990479,
"num_tokens": 931601996.0,
"step": 10110
},
{
"entropy": 1.14140625,
"epoch": 1.3830804974716413,
"grad_norm": 0.09629434881136109,
"learning_rate": 1.5904607580667888e-06,
"loss": 1.143,
"mean_token_accuracy": 0.7262736797332764,
"num_tokens": 932551505.0,
"step": 10120
},
{
"entropy": 1.1515625,
"epoch": 1.3844471778051113,
"grad_norm": 0.0819655985865002,
"learning_rate": 1.5869381428772723e-06,
"loss": 1.163,
"mean_token_accuracy": 0.7241319715976715,
"num_tokens": 933463791.0,
"step": 10130
},
{
"entropy": 1.1640625,
"epoch": 1.3858138581385813,
"grad_norm": 0.07874261024965427,
"learning_rate": 1.5834155276877554e-06,
"loss": 1.1586,
"mean_token_accuracy": 0.7247559130191803,
"num_tokens": 934370194.0,
"step": 10140
},
{
"entropy": 1.15,
"epoch": 1.3871805384720512,
"grad_norm": 0.07155203427586473,
"learning_rate": 1.5798929124982388e-06,
"loss": 1.1482,
"mean_token_accuracy": 0.7252091944217682,
"num_tokens": 935250557.0,
"step": 10150
},
{
"entropy": 1.115625,
"epoch": 1.3885472188055215,
"grad_norm": 0.07654976522787649,
"learning_rate": 1.5763702973087221e-06,
"loss": 1.1246,
"mean_token_accuracy": 0.7299403369426727,
"num_tokens": 936144286.0,
"step": 10160
},
{
"entropy": 1.097265625,
"epoch": 1.3899138991389914,
"grad_norm": 0.08117456354885338,
"learning_rate": 1.5728476821192054e-06,
"loss": 1.0998,
"mean_token_accuracy": 0.7376816272735596,
"num_tokens": 937054635.0,
"step": 10170
},
{
"entropy": 1.14765625,
"epoch": 1.3912805794724614,
"grad_norm": 0.07255379568056815,
"learning_rate": 1.5693250669296886e-06,
"loss": 1.1553,
"mean_token_accuracy": 0.7255343616008758,
"num_tokens": 938010446.0,
"step": 10180
},
{
"entropy": 1.10703125,
"epoch": 1.3926472598059314,
"grad_norm": 0.07760494911978381,
"learning_rate": 1.5658024517401721e-06,
"loss": 1.108,
"mean_token_accuracy": 0.7336413323879242,
"num_tokens": 938926013.0,
"step": 10190
},
{
"entropy": 1.099609375,
"epoch": 1.3940139401394014,
"grad_norm": 0.06656916663296975,
"learning_rate": 1.5622798365506552e-06,
"loss": 1.1019,
"mean_token_accuracy": 0.7361025214195251,
"num_tokens": 939867123.0,
"step": 10200
},
{
"entropy": 1.1375,
"epoch": 1.3953806204728714,
"grad_norm": 0.07264602779336123,
"learning_rate": 1.5587572213611388e-06,
"loss": 1.1279,
"mean_token_accuracy": 0.7304404616355896,
"num_tokens": 940784288.0,
"step": 10210
},
{
"entropy": 1.16484375,
"epoch": 1.3967473008063414,
"grad_norm": 0.07694866687493661,
"learning_rate": 1.555234606171622e-06,
"loss": 1.155,
"mean_token_accuracy": 0.7254554450511932,
"num_tokens": 941732009.0,
"step": 10220
},
{
"entropy": 1.1328125,
"epoch": 1.3981139811398113,
"grad_norm": 0.06670734446088837,
"learning_rate": 1.5517119909821052e-06,
"loss": 1.132,
"mean_token_accuracy": 0.7296274423599243,
"num_tokens": 942687281.0,
"step": 10230
},
{
"entropy": 1.144140625,
"epoch": 1.3994806614732813,
"grad_norm": 0.07651496960815012,
"learning_rate": 1.5481893757925886e-06,
"loss": 1.1455,
"mean_token_accuracy": 0.7271680176258087,
"num_tokens": 943633306.0,
"step": 10240
},
{
"entropy": 1.14453125,
"epoch": 1.4008473418067515,
"grad_norm": 0.08232934110007036,
"learning_rate": 1.544666760603072e-06,
"loss": 1.1418,
"mean_token_accuracy": 0.7262127101421356,
"num_tokens": 944512974.0,
"step": 10250
},
{
"entropy": 1.17421875,
"epoch": 1.4022140221402215,
"grad_norm": 0.07749387929279603,
"learning_rate": 1.541144145413555e-06,
"loss": 1.1872,
"mean_token_accuracy": 0.7186465680599212,
"num_tokens": 945460175.0,
"step": 10260
},
{
"entropy": 1.16953125,
"epoch": 1.4035807024736915,
"grad_norm": 0.07062219338588437,
"learning_rate": 1.5376215302240386e-06,
"loss": 1.1736,
"mean_token_accuracy": 0.7238304376602173,
"num_tokens": 946446899.0,
"step": 10270
},
{
"entropy": 1.12421875,
"epoch": 1.4049473828071615,
"grad_norm": 0.11465494305463382,
"learning_rate": 1.5340989150345217e-06,
"loss": 1.1303,
"mean_token_accuracy": 0.7300590574741364,
"num_tokens": 947379575.0,
"step": 10280
},
{
"entropy": 1.1015625,
"epoch": 1.4063140631406315,
"grad_norm": 0.07386757666824638,
"learning_rate": 1.5305762998450048e-06,
"loss": 1.112,
"mean_token_accuracy": 0.7338379085063934,
"num_tokens": 948285980.0,
"step": 10290
},
{
"entropy": 1.181640625,
"epoch": 1.4076807434741014,
"grad_norm": 0.08264493802507944,
"learning_rate": 1.5270536846554884e-06,
"loss": 1.1856,
"mean_token_accuracy": 0.7201814711093902,
"num_tokens": 949204652.0,
"step": 10300
},
{
"entropy": 1.1578125,
"epoch": 1.4090474238075714,
"grad_norm": 0.08970861561395745,
"learning_rate": 1.5235310694659717e-06,
"loss": 1.1598,
"mean_token_accuracy": 0.7226113617420197,
"num_tokens": 950145164.0,
"step": 10310
},
{
"entropy": 1.10546875,
"epoch": 1.4104141041410414,
"grad_norm": 0.07517765076970682,
"learning_rate": 1.5200084542764548e-06,
"loss": 1.0907,
"mean_token_accuracy": 0.737215393781662,
"num_tokens": 951084582.0,
"step": 10320
},
{
"entropy": 1.1296875,
"epoch": 1.4117807844745114,
"grad_norm": 0.07419065923859601,
"learning_rate": 1.5164858390869384e-06,
"loss": 1.131,
"mean_token_accuracy": 0.7282682716846466,
"num_tokens": 952003428.0,
"step": 10330
},
{
"entropy": 1.21171875,
"epoch": 1.4131474648079814,
"grad_norm": 0.07687474999203672,
"learning_rate": 1.5129632238974215e-06,
"loss": 1.215,
"mean_token_accuracy": 0.7156873106956482,
"num_tokens": 952891571.0,
"step": 10340
},
{
"entropy": 1.13359375,
"epoch": 1.4145141451414514,
"grad_norm": 0.06705005352371814,
"learning_rate": 1.509440608707905e-06,
"loss": 1.1287,
"mean_token_accuracy": 0.7291148364543915,
"num_tokens": 953841805.0,
"step": 10350
},
{
"entropy": 1.1921875,
"epoch": 1.4158808254749213,
"grad_norm": 0.07540213479873127,
"learning_rate": 1.5059179935183882e-06,
"loss": 1.2002,
"mean_token_accuracy": 0.7173850238323212,
"num_tokens": 954757093.0,
"step": 10360
},
{
"entropy": 1.14921875,
"epoch": 1.4172475058083913,
"grad_norm": 0.07587884750391038,
"learning_rate": 1.5023953783288713e-06,
"loss": 1.1595,
"mean_token_accuracy": 0.7259556472301483,
"num_tokens": 955688450.0,
"step": 10370
},
{
"entropy": 1.1234375,
"epoch": 1.4186141861418613,
"grad_norm": 0.07091964697215794,
"learning_rate": 1.4988727631393549e-06,
"loss": 1.1253,
"mean_token_accuracy": 0.7284778177738189,
"num_tokens": 956599499.0,
"step": 10380
},
{
"entropy": 1.14765625,
"epoch": 1.4199808664753313,
"grad_norm": 0.2337602756788078,
"learning_rate": 1.495350147949838e-06,
"loss": 1.1508,
"mean_token_accuracy": 0.7268372058868409,
"num_tokens": 957556773.0,
"step": 10390
},
{
"entropy": 1.116015625,
"epoch": 1.4213475468088015,
"grad_norm": 0.07788060377409708,
"learning_rate": 1.4918275327603213e-06,
"loss": 1.1184,
"mean_token_accuracy": 0.7324895322322845,
"num_tokens": 958434459.0,
"step": 10400
},
{
"entropy": 1.10625,
"epoch": 1.4227142271422715,
"grad_norm": 0.06979383225662257,
"learning_rate": 1.4883049175708047e-06,
"loss": 1.1147,
"mean_token_accuracy": 0.7329348087310791,
"num_tokens": 959305209.0,
"step": 10410
},
{
"entropy": 1.149609375,
"epoch": 1.4240809074757415,
"grad_norm": 0.07183086573939858,
"learning_rate": 1.484782302381288e-06,
"loss": 1.138,
"mean_token_accuracy": 0.7274621784687042,
"num_tokens": 960231022.0,
"step": 10420
},
{
"entropy": 1.15390625,
"epoch": 1.4254475878092114,
"grad_norm": 0.08313997414969168,
"learning_rate": 1.4812596871917711e-06,
"loss": 1.1605,
"mean_token_accuracy": 0.7254222333431244,
"num_tokens": 961131511.0,
"step": 10430
},
{
"entropy": 1.11953125,
"epoch": 1.4268142681426814,
"grad_norm": 0.07499489829204878,
"learning_rate": 1.4777370720022547e-06,
"loss": 1.1146,
"mean_token_accuracy": 0.7319073617458344,
"num_tokens": 962047298.0,
"step": 10440
},
{
"entropy": 1.120703125,
"epoch": 1.4281809484761514,
"grad_norm": 0.0805119532769934,
"learning_rate": 1.4742144568127378e-06,
"loss": 1.1179,
"mean_token_accuracy": 0.731139749288559,
"num_tokens": 963000680.0,
"step": 10450
},
{
"entropy": 1.1046875,
"epoch": 1.4295476288096214,
"grad_norm": 0.07654400609957551,
"learning_rate": 1.4706918416232213e-06,
"loss": 1.0963,
"mean_token_accuracy": 0.7366066455841065,
"num_tokens": 963910634.0,
"step": 10460
},
{
"entropy": 1.13359375,
"epoch": 1.4309143091430914,
"grad_norm": 0.07039244540785394,
"learning_rate": 1.4671692264337045e-06,
"loss": 1.1257,
"mean_token_accuracy": 0.7303950488567352,
"num_tokens": 964864834.0,
"step": 10470
},
{
"entropy": 1.121484375,
"epoch": 1.4322809894765614,
"grad_norm": 0.0695301234194398,
"learning_rate": 1.4636466112441878e-06,
"loss": 1.1211,
"mean_token_accuracy": 0.7297424674034119,
"num_tokens": 965816016.0,
"step": 10480
},
{
"entropy": 1.119921875,
"epoch": 1.4336476698100316,
"grad_norm": 0.07317379920931785,
"learning_rate": 1.4601239960546711e-06,
"loss": 1.1177,
"mean_token_accuracy": 0.7317755997180939,
"num_tokens": 966734698.0,
"step": 10490
},
{
"entropy": 1.146875,
"epoch": 1.4350143501435015,
"grad_norm": 0.07920518719097516,
"learning_rate": 1.4566013808651545e-06,
"loss": 1.1408,
"mean_token_accuracy": 0.7277876675128937,
"num_tokens": 967702846.0,
"step": 10500
},
{
"entropy": 1.10390625,
"epoch": 1.4363810304769715,
"grad_norm": 0.13790731653123317,
"learning_rate": 1.4530787656756376e-06,
"loss": 1.1113,
"mean_token_accuracy": 0.7337267398834229,
"num_tokens": 968655185.0,
"step": 10510
},
{
"entropy": 1.12734375,
"epoch": 1.4377477108104415,
"grad_norm": 0.08000057418411434,
"learning_rate": 1.4495561504861211e-06,
"loss": 1.1188,
"mean_token_accuracy": 0.7305463790893555,
"num_tokens": 969625560.0,
"step": 10520
},
{
"entropy": 1.143359375,
"epoch": 1.4391143911439115,
"grad_norm": 0.06858250704579313,
"learning_rate": 1.4460335352966043e-06,
"loss": 1.1444,
"mean_token_accuracy": 0.7289994359016418,
"num_tokens": 970536328.0,
"step": 10530
},
{
"entropy": 1.18359375,
"epoch": 1.4404810714773815,
"grad_norm": 0.06977961851222206,
"learning_rate": 1.4425109201070876e-06,
"loss": 1.1814,
"mean_token_accuracy": 0.7200577855110168,
"num_tokens": 971542224.0,
"step": 10540
},
{
"entropy": 1.1546875,
"epoch": 1.4418477518108515,
"grad_norm": 0.08917250504689657,
"learning_rate": 1.438988304917571e-06,
"loss": 1.164,
"mean_token_accuracy": 0.7223708748817443,
"num_tokens": 972483619.0,
"step": 10550
},
{
"entropy": 1.13671875,
"epoch": 1.4432144321443214,
"grad_norm": 0.08316913589080716,
"learning_rate": 1.4354656897280543e-06,
"loss": 1.1426,
"mean_token_accuracy": 0.726893424987793,
"num_tokens": 973418101.0,
"step": 10560
},
{
"entropy": 1.13203125,
"epoch": 1.4445811124777914,
"grad_norm": 0.07573805940109554,
"learning_rate": 1.4319430745385376e-06,
"loss": 1.1307,
"mean_token_accuracy": 0.7290737986564636,
"num_tokens": 974333475.0,
"step": 10570
},
{
"entropy": 1.146875,
"epoch": 1.4459477928112614,
"grad_norm": 0.07072803468759951,
"learning_rate": 1.428420459349021e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.7275378823280334,
"num_tokens": 975250053.0,
"step": 10580
},
{
"entropy": 1.0859375,
"epoch": 1.4473144731447314,
"grad_norm": 0.07840051546334531,
"learning_rate": 1.424897844159504e-06,
"loss": 1.0893,
"mean_token_accuracy": 0.7369271397590638,
"num_tokens": 976170564.0,
"step": 10590
},
{
"entropy": 1.1265625,
"epoch": 1.4486811534782014,
"grad_norm": 0.08177288931987244,
"learning_rate": 1.4213752289699876e-06,
"loss": 1.1316,
"mean_token_accuracy": 0.7298027455806733,
"num_tokens": 977076464.0,
"step": 10600
},
{
"entropy": 1.120703125,
"epoch": 1.4500478338116713,
"grad_norm": 0.06587957255240641,
"learning_rate": 1.4178526137804707e-06,
"loss": 1.1226,
"mean_token_accuracy": 0.7304437756538391,
"num_tokens": 977974328.0,
"step": 10610
},
{
"entropy": 1.118359375,
"epoch": 1.4514145141451413,
"grad_norm": 0.07605562925446858,
"learning_rate": 1.4143299985909538e-06,
"loss": 1.1121,
"mean_token_accuracy": 0.7338150441646576,
"num_tokens": 978866088.0,
"step": 10620
},
{
"entropy": 1.119921875,
"epoch": 1.4527811944786113,
"grad_norm": 0.07771369109309847,
"learning_rate": 1.4108073834014374e-06,
"loss": 1.1108,
"mean_token_accuracy": 0.7330794632434845,
"num_tokens": 979814950.0,
"step": 10630
},
{
"entropy": 1.109375,
"epoch": 1.4541478748120815,
"grad_norm": 0.07515411946356766,
"learning_rate": 1.4072847682119205e-06,
"loss": 1.1159,
"mean_token_accuracy": 0.7327601850032807,
"num_tokens": 980760948.0,
"step": 10640
},
{
"entropy": 1.11875,
"epoch": 1.4555145551455515,
"grad_norm": 0.07141844863717581,
"learning_rate": 1.4037621530224039e-06,
"loss": 1.1093,
"mean_token_accuracy": 0.7334080755710601,
"num_tokens": 981737348.0,
"step": 10650
},
{
"entropy": 1.1046875,
"epoch": 1.4568812354790215,
"grad_norm": 0.07383610050914025,
"learning_rate": 1.4002395378328872e-06,
"loss": 1.1078,
"mean_token_accuracy": 0.7322146832942963,
"num_tokens": 982695749.0,
"step": 10660
},
{
"entropy": 1.094140625,
"epoch": 1.4582479158124915,
"grad_norm": 0.075338344878463,
"learning_rate": 1.3967169226433705e-06,
"loss": 1.0903,
"mean_token_accuracy": 0.737960159778595,
"num_tokens": 983614796.0,
"step": 10670
},
{
"entropy": 1.153125,
"epoch": 1.4596145961459615,
"grad_norm": 0.07576000219067015,
"learning_rate": 1.393194307453854e-06,
"loss": 1.1506,
"mean_token_accuracy": 0.7255341470241546,
"num_tokens": 984551709.0,
"step": 10680
},
{
"entropy": 1.165625,
"epoch": 1.4609812764794314,
"grad_norm": 0.07078380862011883,
"learning_rate": 1.3896716922643372e-06,
"loss": 1.1614,
"mean_token_accuracy": 0.7234556674957275,
"num_tokens": 985454993.0,
"step": 10690
},
{
"entropy": 1.11796875,
"epoch": 1.4623479568129014,
"grad_norm": 0.0728252868239956,
"learning_rate": 1.3861490770748203e-06,
"loss": 1.1196,
"mean_token_accuracy": 0.7322444975376129,
"num_tokens": 986391499.0,
"step": 10700
},
{
"entropy": 1.134375,
"epoch": 1.4637146371463714,
"grad_norm": 0.07785532639876405,
"learning_rate": 1.3826264618853039e-06,
"loss": 1.1416,
"mean_token_accuracy": 0.7255606830120087,
"num_tokens": 987263519.0,
"step": 10710
},
{
"entropy": 1.16328125,
"epoch": 1.4650813174798414,
"grad_norm": 0.07217763346235927,
"learning_rate": 1.379103846695787e-06,
"loss": 1.1699,
"mean_token_accuracy": 0.7228078663349151,
"num_tokens": 988255038.0,
"step": 10720
},
{
"entropy": 1.1390625,
"epoch": 1.4664479978133116,
"grad_norm": 0.0742981686556971,
"learning_rate": 1.3755812315062703e-06,
"loss": 1.1532,
"mean_token_accuracy": 0.7255650341510773,
"num_tokens": 989167746.0,
"step": 10730
},
{
"entropy": 1.12578125,
"epoch": 1.4678146781467816,
"grad_norm": 0.07355784045854705,
"learning_rate": 1.3720586163167537e-06,
"loss": 1.1329,
"mean_token_accuracy": 0.7283282995223999,
"num_tokens": 990084172.0,
"step": 10740
},
{
"entropy": 1.16015625,
"epoch": 1.4691813584802516,
"grad_norm": 0.11471696246564854,
"learning_rate": 1.368536001127237e-06,
"loss": 1.1732,
"mean_token_accuracy": 0.7232391655445098,
"num_tokens": 990977216.0,
"step": 10750
},
{
"entropy": 1.109375,
"epoch": 1.4705480388137215,
"grad_norm": 0.07141034290500774,
"learning_rate": 1.3650133859377201e-06,
"loss": 1.0968,
"mean_token_accuracy": 0.7337201118469239,
"num_tokens": 991891997.0,
"step": 10760
},
{
"entropy": 1.1359375,
"epoch": 1.4719147191471915,
"grad_norm": 0.07917192497986676,
"learning_rate": 1.3614907707482037e-06,
"loss": 1.1358,
"mean_token_accuracy": 0.728352838754654,
"num_tokens": 992789231.0,
"step": 10770
},
{
"entropy": 1.20234375,
"epoch": 1.4732813994806615,
"grad_norm": 0.07509926541911408,
"learning_rate": 1.3579681555586868e-06,
"loss": 1.1903,
"mean_token_accuracy": 0.7189868986606598,
"num_tokens": 993739364.0,
"step": 10780
},
{
"entropy": 1.14921875,
"epoch": 1.4746480798141315,
"grad_norm": 0.06521683338364788,
"learning_rate": 1.3544455403691703e-06,
"loss": 1.152,
"mean_token_accuracy": 0.7263393044471741,
"num_tokens": 994688957.0,
"step": 10790
},
{
"entropy": 1.1171875,
"epoch": 1.4760147601476015,
"grad_norm": 0.08088125419728857,
"learning_rate": 1.3509229251796535e-06,
"loss": 1.1048,
"mean_token_accuracy": 0.7350934863090515,
"num_tokens": 995572150.0,
"step": 10800
},
{
"entropy": 1.141796875,
"epoch": 1.4773814404810715,
"grad_norm": 0.07007108171923251,
"learning_rate": 1.3474003099901368e-06,
"loss": 1.1354,
"mean_token_accuracy": 0.7292545139789581,
"num_tokens": 996471230.0,
"step": 10810
},
{
"entropy": 1.1125,
"epoch": 1.4787481208145414,
"grad_norm": 0.0735364486313983,
"learning_rate": 1.3438776948006201e-06,
"loss": 1.1029,
"mean_token_accuracy": 0.7356233894824982,
"num_tokens": 997378119.0,
"step": 10820
},
{
"entropy": 1.082421875,
"epoch": 1.4801148011480114,
"grad_norm": 0.07121227758290609,
"learning_rate": 1.3403550796111035e-06,
"loss": 1.0889,
"mean_token_accuracy": 0.7380770325660706,
"num_tokens": 998303380.0,
"step": 10830
},
{
"entropy": 1.15234375,
"epoch": 1.4814814814814814,
"grad_norm": 0.06959150710914708,
"learning_rate": 1.3368324644215866e-06,
"loss": 1.1461,
"mean_token_accuracy": 0.726474004983902,
"num_tokens": 999225085.0,
"step": 10840
},
{
"entropy": 1.083203125,
"epoch": 1.4828481618149514,
"grad_norm": 0.06856756393950367,
"learning_rate": 1.3333098492320701e-06,
"loss": 1.0825,
"mean_token_accuracy": 0.7378836214542389,
"num_tokens": 1000160882.0,
"step": 10850
},
{
"entropy": 1.1390625,
"epoch": 1.4842148421484214,
"grad_norm": 0.0794391129824923,
"learning_rate": 1.3297872340425533e-06,
"loss": 1.1291,
"mean_token_accuracy": 0.7283900439739227,
"num_tokens": 1001058706.0,
"step": 10860
},
{
"entropy": 1.14921875,
"epoch": 1.4855815224818913,
"grad_norm": 0.07431846526357498,
"learning_rate": 1.3262646188530364e-06,
"loss": 1.1591,
"mean_token_accuracy": 0.7253896713256835,
"num_tokens": 1002025389.0,
"step": 10870
},
{
"entropy": 1.11640625,
"epoch": 1.4869482028153616,
"grad_norm": 0.0727711971245476,
"learning_rate": 1.32274200366352e-06,
"loss": 1.1232,
"mean_token_accuracy": 0.731710147857666,
"num_tokens": 1002947444.0,
"step": 10880
},
{
"entropy": 1.1046875,
"epoch": 1.4883148831488315,
"grad_norm": 0.06887622651011735,
"learning_rate": 1.319219388474003e-06,
"loss": 1.1137,
"mean_token_accuracy": 0.7342944800853729,
"num_tokens": 1003827927.0,
"step": 10890
},
{
"entropy": 1.165625,
"epoch": 1.4896815634823015,
"grad_norm": 0.07549223986001444,
"learning_rate": 1.3156967732844866e-06,
"loss": 1.1669,
"mean_token_accuracy": 0.7236524164676666,
"num_tokens": 1004761314.0,
"step": 10900
},
{
"entropy": 1.0953125,
"epoch": 1.4910482438157715,
"grad_norm": 0.07477248169800077,
"learning_rate": 1.31217415809497e-06,
"loss": 1.0912,
"mean_token_accuracy": 0.7374552249908447,
"num_tokens": 1005703529.0,
"step": 10910
},
{
"entropy": 1.1609375,
"epoch": 1.4924149241492415,
"grad_norm": 0.07323967503658557,
"learning_rate": 1.308651542905453e-06,
"loss": 1.1763,
"mean_token_accuracy": 0.7189733207225799,
"num_tokens": 1006683078.0,
"step": 10920
},
{
"entropy": 1.118359375,
"epoch": 1.4937816044827115,
"grad_norm": 0.077751786027873,
"learning_rate": 1.3051289277159366e-06,
"loss": 1.119,
"mean_token_accuracy": 0.7301354050636292,
"num_tokens": 1007585241.0,
"step": 10930
},
{
"entropy": 1.132421875,
"epoch": 1.4951482848161814,
"grad_norm": 0.07518315271365333,
"learning_rate": 1.3016063125264197e-06,
"loss": 1.1399,
"mean_token_accuracy": 0.7298017919063569,
"num_tokens": 1008478476.0,
"step": 10940
},
{
"entropy": 1.15390625,
"epoch": 1.4965149651496514,
"grad_norm": 0.07306068520971457,
"learning_rate": 1.2980836973369029e-06,
"loss": 1.152,
"mean_token_accuracy": 0.7239294230937958,
"num_tokens": 1009373252.0,
"step": 10950
},
{
"entropy": 1.130859375,
"epoch": 1.4978816454831214,
"grad_norm": 0.07728429935289949,
"learning_rate": 1.2945610821473864e-06,
"loss": 1.1349,
"mean_token_accuracy": 0.7299492299556732,
"num_tokens": 1010255380.0,
"step": 10960
},
{
"entropy": 1.096875,
"epoch": 1.4992483258165916,
"grad_norm": 0.07325618689034473,
"learning_rate": 1.2910384669578695e-06,
"loss": 1.0941,
"mean_token_accuracy": 0.7384086072444915,
"num_tokens": 1011183449.0,
"step": 10970
},
{
"entropy": 1.1,
"epoch": 1.5006150061500616,
"grad_norm": 0.07787235307793541,
"learning_rate": 1.2875158517683529e-06,
"loss": 1.103,
"mean_token_accuracy": 0.7339354455471039,
"num_tokens": 1012078107.0,
"step": 10980
},
{
"entropy": 1.159375,
"epoch": 1.5019816864835316,
"grad_norm": 0.08398501837638662,
"learning_rate": 1.2839932365788362e-06,
"loss": 1.1556,
"mean_token_accuracy": 0.7248615264892578,
"num_tokens": 1012995560.0,
"step": 10990
},
{
"entropy": 1.132421875,
"epoch": 1.5033483668170016,
"grad_norm": 0.07674708846664523,
"learning_rate": 1.2804706213893195e-06,
"loss": 1.1398,
"mean_token_accuracy": 0.7284366190433502,
"num_tokens": 1013935624.0,
"step": 11000
},
{
"entropy": 1.17109375,
"epoch": 1.5047150471504716,
"grad_norm": 0.0729720473613557,
"learning_rate": 1.2769480061998029e-06,
"loss": 1.1819,
"mean_token_accuracy": 0.7214305341243744,
"num_tokens": 1014862406.0,
"step": 11010
},
{
"entropy": 1.123046875,
"epoch": 1.5060817274839415,
"grad_norm": 0.07182330017290349,
"learning_rate": 1.2734253910102862e-06,
"loss": 1.1245,
"mean_token_accuracy": 0.7319157361984253,
"num_tokens": 1015821840.0,
"step": 11020
},
{
"entropy": 1.1453125,
"epoch": 1.5074484078174115,
"grad_norm": 0.07275496917222793,
"learning_rate": 1.2699027758207693e-06,
"loss": 1.1456,
"mean_token_accuracy": 0.726570212841034,
"num_tokens": 1016739059.0,
"step": 11030
},
{
"entropy": 1.1484375,
"epoch": 1.5088150881508815,
"grad_norm": 0.07440094855130473,
"learning_rate": 1.2663801606312529e-06,
"loss": 1.1413,
"mean_token_accuracy": 0.7257960498332977,
"num_tokens": 1017732695.0,
"step": 11040
},
{
"entropy": 1.11796875,
"epoch": 1.5101817684843515,
"grad_norm": 0.07623657816355991,
"learning_rate": 1.262857545441736e-06,
"loss": 1.1141,
"mean_token_accuracy": 0.7335702002048492,
"num_tokens": 1018608385.0,
"step": 11050
},
{
"entropy": 1.14453125,
"epoch": 1.5115484488178215,
"grad_norm": 0.07232335362327114,
"learning_rate": 1.2593349302522193e-06,
"loss": 1.149,
"mean_token_accuracy": 0.7260921835899353,
"num_tokens": 1019521726.0,
"step": 11060
},
{
"entropy": 1.123046875,
"epoch": 1.5129151291512914,
"grad_norm": 0.07363996824929545,
"learning_rate": 1.2558123150627027e-06,
"loss": 1.1328,
"mean_token_accuracy": 0.7288338601589203,
"num_tokens": 1020459793.0,
"step": 11070
},
{
"entropy": 1.17265625,
"epoch": 1.5142818094847614,
"grad_norm": 0.08256372781495609,
"learning_rate": 1.252289699873186e-06,
"loss": 1.1847,
"mean_token_accuracy": 0.7181225895881653,
"num_tokens": 1021389370.0,
"step": 11080
},
{
"entropy": 1.1703125,
"epoch": 1.5156484898182314,
"grad_norm": 0.07997355548533282,
"learning_rate": 1.2487670846836693e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.7221483945846557,
"num_tokens": 1022323761.0,
"step": 11090
},
{
"entropy": 1.21328125,
"epoch": 1.5170151701517014,
"grad_norm": 0.08420302289105287,
"learning_rate": 1.2452444694941527e-06,
"loss": 1.2174,
"mean_token_accuracy": 0.7130800902843475,
"num_tokens": 1023215077.0,
"step": 11100
},
{
"entropy": 1.155078125,
"epoch": 1.5183818504851714,
"grad_norm": 0.07190902935947885,
"learning_rate": 1.2417218543046358e-06,
"loss": 1.1515,
"mean_token_accuracy": 0.725620549917221,
"num_tokens": 1024137735.0,
"step": 11110
},
{
"entropy": 1.12265625,
"epoch": 1.5197485308186414,
"grad_norm": 0.07409428719692752,
"learning_rate": 1.2381992391151191e-06,
"loss": 1.1245,
"mean_token_accuracy": 0.7315682530403137,
"num_tokens": 1025047579.0,
"step": 11120
},
{
"entropy": 1.126171875,
"epoch": 1.5211152111521116,
"grad_norm": 0.07158491675811858,
"learning_rate": 1.2346766239256025e-06,
"loss": 1.1412,
"mean_token_accuracy": 0.7269042074680329,
"num_tokens": 1025997637.0,
"step": 11130
},
{
"entropy": 1.10390625,
"epoch": 1.5224818914855816,
"grad_norm": 0.0704018525761512,
"learning_rate": 1.2311540087360858e-06,
"loss": 1.109,
"mean_token_accuracy": 0.7343244135379792,
"num_tokens": 1026960810.0,
"step": 11140
},
{
"entropy": 1.121875,
"epoch": 1.5238485718190515,
"grad_norm": 0.09566309323077518,
"learning_rate": 1.2276313935465691e-06,
"loss": 1.1108,
"mean_token_accuracy": 0.734355241060257,
"num_tokens": 1027877854.0,
"step": 11150
},
{
"entropy": 1.0875,
"epoch": 1.5252152521525215,
"grad_norm": 0.06954603684060745,
"learning_rate": 1.2241087783570525e-06,
"loss": 1.0872,
"mean_token_accuracy": 0.737934023141861,
"num_tokens": 1028805396.0,
"step": 11160
},
{
"entropy": 1.140234375,
"epoch": 1.5265819324859915,
"grad_norm": 0.07101706282651002,
"learning_rate": 1.2205861631675358e-06,
"loss": 1.1411,
"mean_token_accuracy": 0.7264037609100342,
"num_tokens": 1029738415.0,
"step": 11170
},
{
"entropy": 1.16796875,
"epoch": 1.5279486128194615,
"grad_norm": 0.0781333415233322,
"learning_rate": 1.217063547978019e-06,
"loss": 1.1652,
"mean_token_accuracy": 0.7224674046039581,
"num_tokens": 1030646865.0,
"step": 11180
},
{
"entropy": 1.1328125,
"epoch": 1.5293152931529317,
"grad_norm": 0.07752242416811402,
"learning_rate": 1.2135409327885023e-06,
"loss": 1.1389,
"mean_token_accuracy": 0.7273630142211914,
"num_tokens": 1031564697.0,
"step": 11190
},
{
"entropy": 1.109375,
"epoch": 1.5306819734864017,
"grad_norm": 0.07214831524021938,
"learning_rate": 1.2100183175989856e-06,
"loss": 1.1014,
"mean_token_accuracy": 0.7354528307914734,
"num_tokens": 1032464705.0,
"step": 11200
},
{
"entropy": 1.09609375,
"epoch": 1.5320486538198717,
"grad_norm": 0.07134659243709798,
"learning_rate": 1.206495702409469e-06,
"loss": 1.1006,
"mean_token_accuracy": 0.7351302862167358,
"num_tokens": 1033363698.0,
"step": 11210
},
{
"entropy": 1.07578125,
"epoch": 1.5334153341533416,
"grad_norm": 0.07467617589966043,
"learning_rate": 1.202973087219952e-06,
"loss": 1.0745,
"mean_token_accuracy": 0.7386326372623444,
"num_tokens": 1034287145.0,
"step": 11220
},
{
"entropy": 1.08828125,
"epoch": 1.5347820144868116,
"grad_norm": 0.0719321500899256,
"learning_rate": 1.1994504720304354e-06,
"loss": 1.0994,
"mean_token_accuracy": 0.7345064878463745,
"num_tokens": 1035237499.0,
"step": 11230
},
{
"entropy": 1.090234375,
"epoch": 1.5361486948202816,
"grad_norm": 0.08230972765608717,
"learning_rate": 1.1959278568409187e-06,
"loss": 1.0889,
"mean_token_accuracy": 0.7364981472492218,
"num_tokens": 1036189146.0,
"step": 11240
},
{
"entropy": 1.1109375,
"epoch": 1.5375153751537516,
"grad_norm": 0.18127287862733923,
"learning_rate": 1.192405241651402e-06,
"loss": 1.1126,
"mean_token_accuracy": 0.7326433479785919,
"num_tokens": 1037111626.0,
"step": 11250
},
{
"entropy": 1.12890625,
"epoch": 1.5388820554872216,
"grad_norm": 0.07983634040995025,
"learning_rate": 1.1888826264618854e-06,
"loss": 1.1289,
"mean_token_accuracy": 0.7295945346355438,
"num_tokens": 1038023066.0,
"step": 11260
},
{
"entropy": 1.109375,
"epoch": 1.5402487358206916,
"grad_norm": 0.0661955053448444,
"learning_rate": 1.1853600112723687e-06,
"loss": 1.1085,
"mean_token_accuracy": 0.7337481319904328,
"num_tokens": 1038963483.0,
"step": 11270
},
{
"entropy": 1.10703125,
"epoch": 1.5416154161541615,
"grad_norm": 0.07891272842460145,
"learning_rate": 1.181837396082852e-06,
"loss": 1.1036,
"mean_token_accuracy": 0.7354430735111237,
"num_tokens": 1039900364.0,
"step": 11280
},
{
"entropy": 1.103125,
"epoch": 1.5429820964876315,
"grad_norm": 0.06868249585215337,
"learning_rate": 1.1783147808933352e-06,
"loss": 1.1017,
"mean_token_accuracy": 0.7357358992099762,
"num_tokens": 1040829063.0,
"step": 11290
},
{
"entropy": 1.20625,
"epoch": 1.5443487768211015,
"grad_norm": 0.07610655114016612,
"learning_rate": 1.1747921657038185e-06,
"loss": 1.2107,
"mean_token_accuracy": 0.7149346113204956,
"num_tokens": 1041751052.0,
"step": 11300
},
{
"entropy": 1.092578125,
"epoch": 1.5457154571545715,
"grad_norm": 0.06795281318914435,
"learning_rate": 1.1712695505143019e-06,
"loss": 1.0936,
"mean_token_accuracy": 0.7360093116760253,
"num_tokens": 1042647676.0,
"step": 11310
},
{
"entropy": 1.146875,
"epoch": 1.5470821374880415,
"grad_norm": 0.07658842934905705,
"learning_rate": 1.1677469353247852e-06,
"loss": 1.1415,
"mean_token_accuracy": 0.7272807359695435,
"num_tokens": 1043516583.0,
"step": 11320
},
{
"entropy": 1.154296875,
"epoch": 1.5484488178215114,
"grad_norm": 0.0690257267902915,
"learning_rate": 1.1642243201352685e-06,
"loss": 1.1621,
"mean_token_accuracy": 0.7222444534301757,
"num_tokens": 1044470985.0,
"step": 11330
},
{
"entropy": 1.11015625,
"epoch": 1.5498154981549814,
"grad_norm": 0.0814552735482428,
"learning_rate": 1.1607017049457519e-06,
"loss": 1.1109,
"mean_token_accuracy": 0.7319199800491333,
"num_tokens": 1045407715.0,
"step": 11340
},
{
"entropy": 1.1515625,
"epoch": 1.5511821784884514,
"grad_norm": 0.11261812513728818,
"learning_rate": 1.1571790897562352e-06,
"loss": 1.1516,
"mean_token_accuracy": 0.7258908987045288,
"num_tokens": 1046300953.0,
"step": 11350
},
{
"entropy": 1.1234375,
"epoch": 1.5525488588219214,
"grad_norm": 0.07892363191440852,
"learning_rate": 1.1536564745667183e-06,
"loss": 1.1134,
"mean_token_accuracy": 0.7321428716182709,
"num_tokens": 1047206230.0,
"step": 11360
},
{
"entropy": 1.1328125,
"epoch": 1.5539155391553916,
"grad_norm": 0.07700712950658295,
"learning_rate": 1.1501338593772017e-06,
"loss": 1.1253,
"mean_token_accuracy": 0.7304822444915772,
"num_tokens": 1048108612.0,
"step": 11370
},
{
"entropy": 1.14609375,
"epoch": 1.5552822194888616,
"grad_norm": 0.14694454901427123,
"learning_rate": 1.146611244187685e-06,
"loss": 1.1452,
"mean_token_accuracy": 0.7278164207935334,
"num_tokens": 1049057878.0,
"step": 11380
},
{
"entropy": 1.139453125,
"epoch": 1.5566488998223316,
"grad_norm": 0.0776039762852319,
"learning_rate": 1.1430886289981683e-06,
"loss": 1.144,
"mean_token_accuracy": 0.7247417330741882,
"num_tokens": 1049975577.0,
"step": 11390
},
{
"entropy": 1.098828125,
"epoch": 1.5580155801558015,
"grad_norm": 0.07833984466411759,
"learning_rate": 1.1395660138086517e-06,
"loss": 1.0878,
"mean_token_accuracy": 0.7370636224746704,
"num_tokens": 1050838742.0,
"step": 11400
},
{
"entropy": 1.11328125,
"epoch": 1.5593822604892715,
"grad_norm": 0.07146925642411352,
"learning_rate": 1.136043398619135e-06,
"loss": 1.1168,
"mean_token_accuracy": 0.7325511932373047,
"num_tokens": 1051748442.0,
"step": 11410
},
{
"entropy": 1.16953125,
"epoch": 1.5607489408227415,
"grad_norm": 0.0771290351504959,
"learning_rate": 1.1325207834296184e-06,
"loss": 1.1649,
"mean_token_accuracy": 0.7233783900737762,
"num_tokens": 1052690851.0,
"step": 11420
},
{
"entropy": 1.103125,
"epoch": 1.5621156211562117,
"grad_norm": 0.06934882673427788,
"learning_rate": 1.1289981682401017e-06,
"loss": 1.0954,
"mean_token_accuracy": 0.7355590224266052,
"num_tokens": 1053577690.0,
"step": 11430
},
{
"entropy": 1.1203125,
"epoch": 1.5634823014896817,
"grad_norm": 0.06867483654131724,
"learning_rate": 1.1254755530505848e-06,
"loss": 1.1147,
"mean_token_accuracy": 0.7325948476791382,
"num_tokens": 1054497466.0,
"step": 11440
},
{
"entropy": 1.14609375,
"epoch": 1.5648489818231517,
"grad_norm": 0.07068309278249604,
"learning_rate": 1.1219529378610681e-06,
"loss": 1.1497,
"mean_token_accuracy": 0.7255378484725952,
"num_tokens": 1055417792.0,
"step": 11450
},
{
"entropy": 1.1453125,
"epoch": 1.5662156621566217,
"grad_norm": 0.07232489294331423,
"learning_rate": 1.1184303226715515e-06,
"loss": 1.1486,
"mean_token_accuracy": 0.7275538504123688,
"num_tokens": 1056359927.0,
"step": 11460
},
{
"entropy": 1.15234375,
"epoch": 1.5675823424900917,
"grad_norm": 0.07544423487630288,
"learning_rate": 1.1149077074820346e-06,
"loss": 1.1481,
"mean_token_accuracy": 0.7255580246448516,
"num_tokens": 1057290213.0,
"step": 11470
},
{
"entropy": 1.1984375,
"epoch": 1.5689490228235616,
"grad_norm": 0.08769663605981054,
"learning_rate": 1.111385092292518e-06,
"loss": 1.1962,
"mean_token_accuracy": 0.7199790477752686,
"num_tokens": 1058200427.0,
"step": 11480
},
{
"entropy": 1.12421875,
"epoch": 1.5703157031570316,
"grad_norm": 0.07135938478207242,
"learning_rate": 1.1078624771030013e-06,
"loss": 1.1333,
"mean_token_accuracy": 0.7288059055805206,
"num_tokens": 1059120981.0,
"step": 11490
},
{
"entropy": 1.14140625,
"epoch": 1.5716823834905016,
"grad_norm": 0.0736144212299709,
"learning_rate": 1.1043398619134846e-06,
"loss": 1.143,
"mean_token_accuracy": 0.7261544644832612,
"num_tokens": 1060081905.0,
"step": 11500
},
{
"entropy": 1.1546875,
"epoch": 1.5730490638239716,
"grad_norm": 0.06681349963761672,
"learning_rate": 1.100817246723968e-06,
"loss": 1.1523,
"mean_token_accuracy": 0.7248816847801208,
"num_tokens": 1060987615.0,
"step": 11510
},
{
"entropy": 1.1421875,
"epoch": 1.5744157441574416,
"grad_norm": 0.06702678611858516,
"learning_rate": 1.0972946315344513e-06,
"loss": 1.154,
"mean_token_accuracy": 0.7268297910690308,
"num_tokens": 1061947626.0,
"step": 11520
},
{
"entropy": 1.109765625,
"epoch": 1.5757824244909115,
"grad_norm": 0.07444479025029686,
"learning_rate": 1.0937720163449346e-06,
"loss": 1.1138,
"mean_token_accuracy": 0.7327151238918305,
"num_tokens": 1062853005.0,
"step": 11530
},
{
"entropy": 1.1453125,
"epoch": 1.5771491048243815,
"grad_norm": 0.0766598143446523,
"learning_rate": 1.0902494011554177e-06,
"loss": 1.1422,
"mean_token_accuracy": 0.7267009735107421,
"num_tokens": 1063787432.0,
"step": 11540
},
{
"entropy": 1.115625,
"epoch": 1.5785157851578515,
"grad_norm": 0.07633429129510436,
"learning_rate": 1.086726785965901e-06,
"loss": 1.1184,
"mean_token_accuracy": 0.7320018410682678,
"num_tokens": 1064750440.0,
"step": 11550
},
{
"entropy": 1.173046875,
"epoch": 1.5798824654913215,
"grad_norm": 0.07863631374223312,
"learning_rate": 1.0832041707763844e-06,
"loss": 1.1751,
"mean_token_accuracy": 0.7219699919223785,
"num_tokens": 1065677004.0,
"step": 11560
},
{
"entropy": 1.171875,
"epoch": 1.5812491458247915,
"grad_norm": 0.07315267154063616,
"learning_rate": 1.0796815555868678e-06,
"loss": 1.1848,
"mean_token_accuracy": 0.72130246758461,
"num_tokens": 1066659637.0,
"step": 11570
},
{
"entropy": 1.171875,
"epoch": 1.5826158261582615,
"grad_norm": 0.06452022453223746,
"learning_rate": 1.076158940397351e-06,
"loss": 1.1771,
"mean_token_accuracy": 0.7210237503051757,
"num_tokens": 1067586812.0,
"step": 11580
},
{
"entropy": 1.15078125,
"epoch": 1.5839825064917314,
"grad_norm": 0.07453054466669969,
"learning_rate": 1.0726363252078344e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.7251493215560914,
"num_tokens": 1068476288.0,
"step": 11590
},
{
"entropy": 1.096484375,
"epoch": 1.5853491868252014,
"grad_norm": 0.06938121784368036,
"learning_rate": 1.0691137100183178e-06,
"loss": 1.0893,
"mean_token_accuracy": 0.7386395812034607,
"num_tokens": 1069384818.0,
"step": 11600
},
{
"entropy": 1.098828125,
"epoch": 1.5867158671586716,
"grad_norm": 0.07457625540048386,
"learning_rate": 1.065591094828801e-06,
"loss": 1.1019,
"mean_token_accuracy": 0.7351349532604218,
"num_tokens": 1070358732.0,
"step": 11610
},
{
"entropy": 1.11171875,
"epoch": 1.5880825474921416,
"grad_norm": 0.07353073398919974,
"learning_rate": 1.0620684796392842e-06,
"loss": 1.1054,
"mean_token_accuracy": 0.7327687799930572,
"num_tokens": 1071225249.0,
"step": 11620
},
{
"entropy": 1.0640625,
"epoch": 1.5894492278256116,
"grad_norm": 0.4800690719924305,
"learning_rate": 1.0585458644497676e-06,
"loss": 1.0599,
"mean_token_accuracy": 0.7415197551250458,
"num_tokens": 1072105786.0,
"step": 11630
},
{
"entropy": 1.159375,
"epoch": 1.5908159081590816,
"grad_norm": 0.07937211730380034,
"learning_rate": 1.0550232492602509e-06,
"loss": 1.1664,
"mean_token_accuracy": 0.7223095715045929,
"num_tokens": 1073011100.0,
"step": 11640
},
{
"entropy": 1.1640625,
"epoch": 1.5921825884925516,
"grad_norm": 0.07475877997345942,
"learning_rate": 1.0515006340707342e-06,
"loss": 1.1657,
"mean_token_accuracy": 0.7252254128456116,
"num_tokens": 1073914349.0,
"step": 11650
},
{
"entropy": 1.088671875,
"epoch": 1.5935492688260215,
"grad_norm": 0.07686015843223253,
"learning_rate": 1.0479780188812176e-06,
"loss": 1.0934,
"mean_token_accuracy": 0.7367645323276519,
"num_tokens": 1074787847.0,
"step": 11660
},
{
"entropy": 1.1203125,
"epoch": 1.5949159491594918,
"grad_norm": 0.07279065614810887,
"learning_rate": 1.0444554036917009e-06,
"loss": 1.1274,
"mean_token_accuracy": 0.7297516763210297,
"num_tokens": 1075720122.0,
"step": 11670
},
{
"entropy": 1.14609375,
"epoch": 1.5962826294929617,
"grad_norm": 0.07047729027704507,
"learning_rate": 1.0409327885021842e-06,
"loss": 1.1553,
"mean_token_accuracy": 0.722459328174591,
"num_tokens": 1076658698.0,
"step": 11680
},
{
"entropy": 1.10078125,
"epoch": 1.5976493098264317,
"grad_norm": 0.07214628013636515,
"learning_rate": 1.0374101733126674e-06,
"loss": 1.1161,
"mean_token_accuracy": 0.7328182518482208,
"num_tokens": 1077578515.0,
"step": 11690
},
{
"entropy": 1.08671875,
"epoch": 1.5990159901599017,
"grad_norm": 0.08926917204454757,
"learning_rate": 1.0338875581231507e-06,
"loss": 1.0775,
"mean_token_accuracy": 0.7390931069850921,
"num_tokens": 1078439448.0,
"step": 11700
},
{
"entropy": 1.15,
"epoch": 1.6003826704933717,
"grad_norm": 0.07535103617240142,
"learning_rate": 1.030364942933634e-06,
"loss": 1.1559,
"mean_token_accuracy": 0.7225850820541382,
"num_tokens": 1079335986.0,
"step": 11710
},
{
"entropy": 1.1515625,
"epoch": 1.6017493508268417,
"grad_norm": 0.07217372108165125,
"learning_rate": 1.0268423277441174e-06,
"loss": 1.1533,
"mean_token_accuracy": 0.724703335762024,
"num_tokens": 1080224151.0,
"step": 11720
},
{
"entropy": 1.140625,
"epoch": 1.6031160311603116,
"grad_norm": 0.07657908078798345,
"learning_rate": 1.0233197125546005e-06,
"loss": 1.1384,
"mean_token_accuracy": 0.7291032195091247,
"num_tokens": 1081090458.0,
"step": 11730
},
{
"entropy": 1.1453125,
"epoch": 1.6044827114937816,
"grad_norm": 0.07368220777392756,
"learning_rate": 1.0197970973650838e-06,
"loss": 1.1476,
"mean_token_accuracy": 0.7261252403259277,
"num_tokens": 1082015833.0,
"step": 11740
},
{
"entropy": 1.1328125,
"epoch": 1.6058493918272516,
"grad_norm": 0.07401251552119684,
"learning_rate": 1.0162744821755674e-06,
"loss": 1.1464,
"mean_token_accuracy": 0.7272646725177765,
"num_tokens": 1082949779.0,
"step": 11750
},
{
"entropy": 1.1109375,
"epoch": 1.6072160721607216,
"grad_norm": 0.07625682287757057,
"learning_rate": 1.0127518669860505e-06,
"loss": 1.1126,
"mean_token_accuracy": 0.7324575543403625,
"num_tokens": 1083865005.0,
"step": 11760
},
{
"entropy": 1.16171875,
"epoch": 1.6085827524941916,
"grad_norm": 0.08271346336750815,
"learning_rate": 1.0092292517965338e-06,
"loss": 1.1703,
"mean_token_accuracy": 0.7241205751895905,
"num_tokens": 1084813965.0,
"step": 11770
},
{
"entropy": 1.13671875,
"epoch": 1.6099494328276616,
"grad_norm": 0.07663355868438837,
"learning_rate": 1.0057066366070172e-06,
"loss": 1.1393,
"mean_token_accuracy": 0.728145158290863,
"num_tokens": 1085727725.0,
"step": 11780
},
{
"entropy": 1.1078125,
"epoch": 1.6113161131611315,
"grad_norm": 0.07434464742457553,
"learning_rate": 1.0021840214175005e-06,
"loss": 1.1057,
"mean_token_accuracy": 0.7350985407829285,
"num_tokens": 1086639180.0,
"step": 11790
},
{
"entropy": 1.13203125,
"epoch": 1.6126827934946015,
"grad_norm": 0.07315454605427006,
"learning_rate": 9.986614062279836e-07,
"loss": 1.1392,
"mean_token_accuracy": 0.7280153810977936,
"num_tokens": 1087573751.0,
"step": 11800
},
{
"entropy": 1.07890625,
"epoch": 1.6140494738280715,
"grad_norm": 0.07461580067124629,
"learning_rate": 9.95138791038467e-07,
"loss": 1.0776,
"mean_token_accuracy": 0.7395806729793548,
"num_tokens": 1088506212.0,
"step": 11810
},
{
"entropy": 1.108203125,
"epoch": 1.6154161541615415,
"grad_norm": 0.07436232353671471,
"learning_rate": 9.916161758489503e-07,
"loss": 1.0965,
"mean_token_accuracy": 0.7366746842861176,
"num_tokens": 1089428943.0,
"step": 11820
},
{
"entropy": 1.13125,
"epoch": 1.6167828344950115,
"grad_norm": 0.0986514440466465,
"learning_rate": 9.880935606594336e-07,
"loss": 1.1371,
"mean_token_accuracy": 0.7287112712860108,
"num_tokens": 1090406547.0,
"step": 11830
},
{
"entropy": 1.14140625,
"epoch": 1.6181495148284815,
"grad_norm": 0.07060335929608542,
"learning_rate": 9.84570945469917e-07,
"loss": 1.1586,
"mean_token_accuracy": 0.7252133607864379,
"num_tokens": 1091320300.0,
"step": 11840
},
{
"entropy": 1.138671875,
"epoch": 1.6195161951619517,
"grad_norm": 0.07495396703692456,
"learning_rate": 9.810483302804003e-07,
"loss": 1.1302,
"mean_token_accuracy": 0.7287598252296448,
"num_tokens": 1092235124.0,
"step": 11850
},
{
"entropy": 1.1234375,
"epoch": 1.6208828754954216,
"grad_norm": 0.08672376037079943,
"learning_rate": 9.775257150908836e-07,
"loss": 1.1165,
"mean_token_accuracy": 0.7313764452934265,
"num_tokens": 1093131170.0,
"step": 11860
},
{
"entropy": 1.1390625,
"epoch": 1.6222495558288916,
"grad_norm": 0.07686764086715907,
"learning_rate": 9.740030999013668e-07,
"loss": 1.1491,
"mean_token_accuracy": 0.7277199804782868,
"num_tokens": 1094028259.0,
"step": 11870
},
{
"entropy": 1.098046875,
"epoch": 1.6236162361623616,
"grad_norm": 0.07092121379151689,
"learning_rate": 9.7048048471185e-07,
"loss": 1.0912,
"mean_token_accuracy": 0.7338119983673096,
"num_tokens": 1094923700.0,
"step": 11880
},
{
"entropy": 1.140625,
"epoch": 1.6249829164958316,
"grad_norm": 0.08628170560746415,
"learning_rate": 9.669578695223334e-07,
"loss": 1.1463,
"mean_token_accuracy": 0.726482504606247,
"num_tokens": 1095863029.0,
"step": 11890
},
{
"entropy": 1.11484375,
"epoch": 1.6263495968293016,
"grad_norm": 0.07348885958942508,
"learning_rate": 9.634352543328168e-07,
"loss": 1.1165,
"mean_token_accuracy": 0.7314401745796204,
"num_tokens": 1096756795.0,
"step": 11900
},
{
"entropy": 1.130078125,
"epoch": 1.6277162771627718,
"grad_norm": 0.08090916552323121,
"learning_rate": 9.599126391433e-07,
"loss": 1.1338,
"mean_token_accuracy": 0.7289749383926392,
"num_tokens": 1097677930.0,
"step": 11910
},
{
"entropy": 1.15859375,
"epoch": 1.6290829574962418,
"grad_norm": 0.07602356110510554,
"learning_rate": 9.563900239537834e-07,
"loss": 1.1609,
"mean_token_accuracy": 0.7230493247509002,
"num_tokens": 1098586303.0,
"step": 11920
},
{
"entropy": 1.10390625,
"epoch": 1.6304496378297118,
"grad_norm": 0.07387398657673559,
"learning_rate": 9.528674087642667e-07,
"loss": 1.1102,
"mean_token_accuracy": 0.7329406201839447,
"num_tokens": 1099532417.0,
"step": 11930
},
{
"entropy": 1.09375,
"epoch": 1.6318163181631817,
"grad_norm": 0.07301436234729337,
"learning_rate": 9.4934479357475e-07,
"loss": 1.0816,
"mean_token_accuracy": 0.7398154258728027,
"num_tokens": 1100461088.0,
"step": 11940
},
{
"entropy": 1.159375,
"epoch": 1.6331829984966517,
"grad_norm": 0.07666340819975398,
"learning_rate": 9.458221783852332e-07,
"loss": 1.1537,
"mean_token_accuracy": 0.7248700141906739,
"num_tokens": 1101423253.0,
"step": 11950
},
{
"entropy": 1.1265625,
"epoch": 1.6345496788301217,
"grad_norm": 0.09076661561499849,
"learning_rate": 9.422995631957166e-07,
"loss": 1.1262,
"mean_token_accuracy": 0.7307249546051026,
"num_tokens": 1102306270.0,
"step": 11960
},
{
"entropy": 1.15703125,
"epoch": 1.6359163591635917,
"grad_norm": 0.0816366998983449,
"learning_rate": 9.387769480061999e-07,
"loss": 1.16,
"mean_token_accuracy": 0.7248102009296418,
"num_tokens": 1103221346.0,
"step": 11970
},
{
"entropy": 1.1328125,
"epoch": 1.6372830394970617,
"grad_norm": 0.077894253697232,
"learning_rate": 9.352543328166831e-07,
"loss": 1.1268,
"mean_token_accuracy": 0.7305737376213074,
"num_tokens": 1104181732.0,
"step": 11980
},
{
"entropy": 1.12265625,
"epoch": 1.6386497198305316,
"grad_norm": 0.06944067284938253,
"learning_rate": 9.317317176271665e-07,
"loss": 1.1276,
"mean_token_accuracy": 0.7302307486534119,
"num_tokens": 1105059442.0,
"step": 11990
},
{
"entropy": 1.1578125,
"epoch": 1.6400164001640016,
"grad_norm": 0.06209714753795566,
"learning_rate": 9.282091024376498e-07,
"loss": 1.1656,
"mean_token_accuracy": 0.7224168121814728,
"num_tokens": 1106044386.0,
"step": 12000
},
{
"entropy": 1.1453125,
"epoch": 1.6413830804974716,
"grad_norm": 0.07290580583006306,
"learning_rate": 9.246864872481331e-07,
"loss": 1.1441,
"mean_token_accuracy": 0.7274718344211578,
"num_tokens": 919957.0,
"step": 12010
},
{
"entropy": 1.1078125,
"epoch": 1.6427497608309416,
"grad_norm": 0.07876433029104002,
"learning_rate": 9.211638720586164e-07,
"loss": 1.1098,
"mean_token_accuracy": 0.7308490097522735,
"num_tokens": 1791388.0,
"step": 12020
},
{
"entropy": 1.12578125,
"epoch": 1.6441164411644116,
"grad_norm": 0.0726092521299502,
"learning_rate": 9.176412568690997e-07,
"loss": 1.1265,
"mean_token_accuracy": 0.7295171737670898,
"num_tokens": 2716370.0,
"step": 12030
},
{
"entropy": 1.140625,
"epoch": 1.6454831214978816,
"grad_norm": 0.07421155242774707,
"learning_rate": 9.14118641679583e-07,
"loss": 1.1361,
"mean_token_accuracy": 0.7288490355014801,
"num_tokens": 3631132.0,
"step": 12040
},
{
"entropy": 1.1375,
"epoch": 1.6468498018313515,
"grad_norm": 0.08086171027570548,
"learning_rate": 9.105960264900663e-07,
"loss": 1.1352,
"mean_token_accuracy": 0.7290159106254578,
"num_tokens": 4509030.0,
"step": 12050
},
{
"entropy": 1.10234375,
"epoch": 1.6482164821648215,
"grad_norm": 0.07194889056221969,
"learning_rate": 9.070734113005496e-07,
"loss": 1.0918,
"mean_token_accuracy": 0.7371569454669953,
"num_tokens": 5398084.0,
"step": 12060
},
{
"entropy": 1.123046875,
"epoch": 1.6495831624982915,
"grad_norm": 0.07202099912967813,
"learning_rate": 9.035507961110329e-07,
"loss": 1.1223,
"mean_token_accuracy": 0.7308742702007294,
"num_tokens": 6266301.0,
"step": 12070
},
{
"entropy": 1.060546875,
"epoch": 1.6509498428317615,
"grad_norm": 0.0760883240025025,
"learning_rate": 9.000281809215163e-07,
"loss": 1.0468,
"mean_token_accuracy": 0.7447024643421173,
"num_tokens": 7154758.0,
"step": 12080
},
{
"entropy": 1.0875,
"epoch": 1.6523165231652317,
"grad_norm": 0.07107232365856134,
"learning_rate": 8.965055657319995e-07,
"loss": 1.0947,
"mean_token_accuracy": 0.7370666086673736,
"num_tokens": 8080747.0,
"step": 12090
},
{
"entropy": 1.1546875,
"epoch": 1.6536832034987017,
"grad_norm": 0.0739307470943422,
"learning_rate": 8.929829505424828e-07,
"loss": 1.1586,
"mean_token_accuracy": 0.7251729488372802,
"num_tokens": 8968701.0,
"step": 12100
},
{
"entropy": 1.12890625,
"epoch": 1.6550498838321717,
"grad_norm": 0.07910679945984896,
"learning_rate": 8.894603353529662e-07,
"loss": 1.1338,
"mean_token_accuracy": 0.7288496911525726,
"num_tokens": 9859682.0,
"step": 12110
},
{
"entropy": 1.133203125,
"epoch": 1.6564165641656416,
"grad_norm": 0.08197839256509593,
"learning_rate": 8.859377201634495e-07,
"loss": 1.1311,
"mean_token_accuracy": 0.7299401938915253,
"num_tokens": 10758105.0,
"step": 12120
},
{
"entropy": 1.15703125,
"epoch": 1.6577832444991116,
"grad_norm": 0.07332706845208195,
"learning_rate": 8.824151049739326e-07,
"loss": 1.16,
"mean_token_accuracy": 0.7228416562080383,
"num_tokens": 11738390.0,
"step": 12130
},
{
"entropy": 1.12109375,
"epoch": 1.6591499248325816,
"grad_norm": 0.07747845858594067,
"learning_rate": 8.788924897844161e-07,
"loss": 1.1206,
"mean_token_accuracy": 0.732740706205368,
"num_tokens": 12678741.0,
"step": 12140
},
{
"entropy": 1.11328125,
"epoch": 1.6605166051660518,
"grad_norm": 0.10958724834787456,
"learning_rate": 8.753698745948994e-07,
"loss": 1.111,
"mean_token_accuracy": 0.7334132671356202,
"num_tokens": 13662733.0,
"step": 12150
},
{
"entropy": 1.126953125,
"epoch": 1.6618832854995218,
"grad_norm": 0.07439329811916963,
"learning_rate": 8.718472594053825e-07,
"loss": 1.1256,
"mean_token_accuracy": 0.7298170864582062,
"num_tokens": 14575264.0,
"step": 12160
},
{
"entropy": 1.1046875,
"epoch": 1.6632499658329918,
"grad_norm": 0.08045071429171007,
"learning_rate": 8.683246442158659e-07,
"loss": 1.1205,
"mean_token_accuracy": 0.7306158304214477,
"num_tokens": 15538810.0,
"step": 12170
},
{
"entropy": 1.1,
"epoch": 1.6646166461664618,
"grad_norm": 0.07128309744458897,
"learning_rate": 8.648020290263492e-07,
"loss": 1.1004,
"mean_token_accuracy": 0.7361827075481415,
"num_tokens": 16472619.0,
"step": 12180
},
{
"entropy": 1.10234375,
"epoch": 1.6659833264999317,
"grad_norm": 0.08009830865925312,
"learning_rate": 8.612794138368325e-07,
"loss": 1.1024,
"mean_token_accuracy": 0.7343206942081452,
"num_tokens": 17377623.0,
"step": 12190
},
{
"entropy": 1.12265625,
"epoch": 1.6673500068334017,
"grad_norm": 0.0714652452941825,
"learning_rate": 8.577567986473158e-07,
"loss": 1.1264,
"mean_token_accuracy": 0.7308506786823272,
"num_tokens": 18334132.0,
"step": 12200
},
{
"entropy": 1.13671875,
"epoch": 1.6687166871668717,
"grad_norm": 0.06998021095801672,
"learning_rate": 8.542341834577991e-07,
"loss": 1.1368,
"mean_token_accuracy": 0.7289149701595307,
"num_tokens": 19297366.0,
"step": 12210
},
{
"entropy": 1.1390625,
"epoch": 1.6700833675003417,
"grad_norm": 0.09247413207760388,
"learning_rate": 8.507115682682824e-07,
"loss": 1.1426,
"mean_token_accuracy": 0.7277171432971954,
"num_tokens": 20237051.0,
"step": 12220
},
{
"entropy": 1.103125,
"epoch": 1.6714500478338117,
"grad_norm": 0.08414821059755431,
"learning_rate": 8.471889530787658e-07,
"loss": 1.1081,
"mean_token_accuracy": 0.7334043741226196,
"num_tokens": 21166451.0,
"step": 12230
},
{
"entropy": 1.1484375,
"epoch": 1.6728167281672817,
"grad_norm": 0.08172484208540767,
"learning_rate": 8.43666337889249e-07,
"loss": 1.1563,
"mean_token_accuracy": 0.7254205703735351,
"num_tokens": 22118238.0,
"step": 12240
},
{
"entropy": 1.107421875,
"epoch": 1.6741834085007516,
"grad_norm": 0.07773932362215462,
"learning_rate": 8.401437226997323e-07,
"loss": 1.1037,
"mean_token_accuracy": 0.7334123253822327,
"num_tokens": 23052978.0,
"step": 12250
},
{
"entropy": 1.09453125,
"epoch": 1.6755500888342216,
"grad_norm": 0.0850995336332641,
"learning_rate": 8.366211075102157e-07,
"loss": 1.0893,
"mean_token_accuracy": 0.7369394659996032,
"num_tokens": 23976168.0,
"step": 12260
},
{
"entropy": 1.0765625,
"epoch": 1.6769167691676916,
"grad_norm": 0.07087915634215679,
"learning_rate": 8.330984923206989e-07,
"loss": 1.072,
"mean_token_accuracy": 0.7407988727092742,
"num_tokens": 24900925.0,
"step": 12270
},
{
"entropy": 1.115625,
"epoch": 1.6782834495011616,
"grad_norm": 0.07279620557264904,
"learning_rate": 8.295758771311822e-07,
"loss": 1.1212,
"mean_token_accuracy": 0.7318483829498291,
"num_tokens": 25799980.0,
"step": 12280
},
{
"entropy": 1.14296875,
"epoch": 1.6796501298346316,
"grad_norm": 0.07322460871348639,
"learning_rate": 8.260532619416656e-07,
"loss": 1.1389,
"mean_token_accuracy": 0.729259067773819,
"num_tokens": 26704876.0,
"step": 12290
},
{
"entropy": 1.125,
"epoch": 1.6810168101681016,
"grad_norm": 0.07214039453670497,
"learning_rate": 8.225306467521489e-07,
"loss": 1.1254,
"mean_token_accuracy": 0.7326715290546417,
"num_tokens": 27624201.0,
"step": 12300
},
{
"entropy": 1.1125,
"epoch": 1.6823834905015715,
"grad_norm": 0.07152631669148783,
"learning_rate": 8.190080315626321e-07,
"loss": 1.1194,
"mean_token_accuracy": 0.7320015490055084,
"num_tokens": 28539868.0,
"step": 12310
},
{
"entropy": 1.0828125,
"epoch": 1.6837501708350415,
"grad_norm": 0.07325476560022486,
"learning_rate": 8.154854163731155e-07,
"loss": 1.0868,
"mean_token_accuracy": 0.7376817345619202,
"num_tokens": 29467296.0,
"step": 12320
},
{
"entropy": 1.0875,
"epoch": 1.6851168511685117,
"grad_norm": 0.07059340727642241,
"learning_rate": 8.119628011835988e-07,
"loss": 1.0915,
"mean_token_accuracy": 0.7372280836105347,
"num_tokens": 30380662.0,
"step": 12330
},
{
"entropy": 1.111328125,
"epoch": 1.6864835315019817,
"grad_norm": 0.08197106168053442,
"learning_rate": 8.084401859940821e-07,
"loss": 1.1127,
"mean_token_accuracy": 0.7314554929733277,
"num_tokens": 31304971.0,
"step": 12340
},
{
"entropy": 1.090625,
"epoch": 1.6878502118354517,
"grad_norm": 0.0747152206646852,
"learning_rate": 8.049175708045654e-07,
"loss": 1.0855,
"mean_token_accuracy": 0.7367800831794739,
"num_tokens": 32196022.0,
"step": 12350
},
{
"entropy": 1.1234375,
"epoch": 1.6892168921689217,
"grad_norm": 0.08043726107061462,
"learning_rate": 8.013949556150487e-07,
"loss": 1.1289,
"mean_token_accuracy": 0.7304772675037384,
"num_tokens": 33135862.0,
"step": 12360
},
{
"entropy": 1.101953125,
"epoch": 1.6905835725023917,
"grad_norm": 0.08227812575001058,
"learning_rate": 7.97872340425532e-07,
"loss": 1.1061,
"mean_token_accuracy": 0.7345277547836304,
"num_tokens": 34067010.0,
"step": 12370
},
{
"entropy": 1.10859375,
"epoch": 1.6919502528358616,
"grad_norm": 0.07273249184959597,
"learning_rate": 7.943497252360153e-07,
"loss": 1.0933,
"mean_token_accuracy": 0.7361769437789917,
"num_tokens": 35013313.0,
"step": 12380
},
{
"entropy": 1.159375,
"epoch": 1.6933169331693319,
"grad_norm": 0.11619339492280206,
"learning_rate": 7.908271100464986e-07,
"loss": 1.1576,
"mean_token_accuracy": 0.7236440718173981,
"num_tokens": 35906978.0,
"step": 12390
},
{
"entropy": 1.1484375,
"epoch": 1.6946836135028018,
"grad_norm": 0.07120017172316383,
"learning_rate": 7.873044948569819e-07,
"loss": 1.1529,
"mean_token_accuracy": 0.726298725605011,
"num_tokens": 36909392.0,
"step": 12400
},
{
"entropy": 1.096875,
"epoch": 1.6960502938362718,
"grad_norm": 0.07232236925599657,
"learning_rate": 7.837818796674653e-07,
"loss": 1.1012,
"mean_token_accuracy": 0.7353088438510895,
"num_tokens": 37900621.0,
"step": 12410
},
{
"entropy": 1.137890625,
"epoch": 1.6974169741697418,
"grad_norm": 0.07258249286079738,
"learning_rate": 7.802592644779484e-07,
"loss": 1.143,
"mean_token_accuracy": 0.726912897825241,
"num_tokens": 38852623.0,
"step": 12420
},
{
"entropy": 1.17578125,
"epoch": 1.6987836545032118,
"grad_norm": 0.07582877289388208,
"learning_rate": 7.767366492884317e-07,
"loss": 1.1725,
"mean_token_accuracy": 0.7218635678291321,
"num_tokens": 39767551.0,
"step": 12430
},
{
"entropy": 1.09140625,
"epoch": 1.7001503348366818,
"grad_norm": 0.06995090985414679,
"learning_rate": 7.732140340989152e-07,
"loss": 1.1004,
"mean_token_accuracy": 0.73497274518013,
"num_tokens": 40712705.0,
"step": 12440
},
{
"entropy": 1.10703125,
"epoch": 1.7015170151701517,
"grad_norm": 0.08126150381574722,
"learning_rate": 7.696914189093985e-07,
"loss": 1.0992,
"mean_token_accuracy": 0.7355374157428741,
"num_tokens": 41637522.0,
"step": 12450
},
{
"entropy": 1.108984375,
"epoch": 1.7028836955036217,
"grad_norm": 0.07431403928612604,
"learning_rate": 7.661688037198816e-07,
"loss": 1.0979,
"mean_token_accuracy": 0.73477823138237,
"num_tokens": 42550960.0,
"step": 12460
},
{
"entropy": 1.1421875,
"epoch": 1.7042503758370917,
"grad_norm": 0.07119816660657463,
"learning_rate": 7.62646188530365e-07,
"loss": 1.1401,
"mean_token_accuracy": 0.7306041121482849,
"num_tokens": 43441422.0,
"step": 12470
},
{
"entropy": 1.16640625,
"epoch": 1.7056170561705617,
"grad_norm": 0.07297716931850312,
"learning_rate": 7.591235733408483e-07,
"loss": 1.1702,
"mean_token_accuracy": 0.7232620179653168,
"num_tokens": 44354493.0,
"step": 12480
},
{
"entropy": 1.123828125,
"epoch": 1.7069837365040317,
"grad_norm": 0.0768333995940095,
"learning_rate": 7.556009581513315e-07,
"loss": 1.1212,
"mean_token_accuracy": 0.7282783329486847,
"num_tokens": 45289906.0,
"step": 12490
},
{
"entropy": 1.133203125,
"epoch": 1.7083504168375017,
"grad_norm": 0.07147015667004314,
"learning_rate": 7.520783429618149e-07,
"loss": 1.1434,
"mean_token_accuracy": 0.7280852138996124,
"num_tokens": 46221626.0,
"step": 12500
},
{
"entropy": 1.12265625,
"epoch": 1.7097170971709716,
"grad_norm": 0.07159141996561541,
"learning_rate": 7.485557277722982e-07,
"loss": 1.114,
"mean_token_accuracy": 0.7313182473182678,
"num_tokens": 47164066.0,
"step": 12510
},
{
"entropy": 1.125,
"epoch": 1.7110837775044416,
"grad_norm": 0.07311982018166459,
"learning_rate": 7.450331125827815e-07,
"loss": 1.1203,
"mean_token_accuracy": 0.7338204205036163,
"num_tokens": 48118378.0,
"step": 12520
},
{
"entropy": 1.134375,
"epoch": 1.7124504578379116,
"grad_norm": 0.07506050803091907,
"learning_rate": 7.415104973932648e-07,
"loss": 1.1272,
"mean_token_accuracy": 0.7291731595993042,
"num_tokens": 49039581.0,
"step": 12530
},
{
"entropy": 1.1375,
"epoch": 1.7138171381713816,
"grad_norm": 0.07829961599338793,
"learning_rate": 7.379878822037481e-07,
"loss": 1.1419,
"mean_token_accuracy": 0.7262904107570648,
"num_tokens": 49973038.0,
"step": 12540
},
{
"entropy": 1.15703125,
"epoch": 1.7151838185048516,
"grad_norm": 0.07605227940483128,
"learning_rate": 7.344652670142314e-07,
"loss": 1.1601,
"mean_token_accuracy": 0.7243201255798339,
"num_tokens": 50885170.0,
"step": 12550
},
{
"entropy": 1.112890625,
"epoch": 1.7165504988383216,
"grad_norm": 0.07482583429596376,
"learning_rate": 7.309426518247147e-07,
"loss": 1.1065,
"mean_token_accuracy": 0.7335156798362732,
"num_tokens": 51807456.0,
"step": 12560
},
{
"entropy": 1.12265625,
"epoch": 1.7179171791717918,
"grad_norm": 0.07318155608342981,
"learning_rate": 7.27420036635198e-07,
"loss": 1.1352,
"mean_token_accuracy": 0.7287726044654846,
"num_tokens": 52727249.0,
"step": 12570
},
{
"entropy": 1.09609375,
"epoch": 1.7192838595052617,
"grad_norm": 0.0785757230201231,
"learning_rate": 7.238974214456813e-07,
"loss": 1.0977,
"mean_token_accuracy": 0.735130226612091,
"num_tokens": 53608676.0,
"step": 12580
},
{
"entropy": 1.18046875,
"epoch": 1.7206505398387317,
"grad_norm": 0.07032630473367685,
"learning_rate": 7.203748062561647e-07,
"loss": 1.2013,
"mean_token_accuracy": 0.7223058581352234,
"num_tokens": 54545545.0,
"step": 12590
},
{
"entropy": 1.12890625,
"epoch": 1.7220172201722017,
"grad_norm": 0.072406484408865,
"learning_rate": 7.168521910666479e-07,
"loss": 1.1348,
"mean_token_accuracy": 0.7288249552249908,
"num_tokens": 55507524.0,
"step": 12600
},
{
"entropy": 1.1203125,
"epoch": 1.7233839005056717,
"grad_norm": 0.07173198893116844,
"learning_rate": 7.133295758771312e-07,
"loss": 1.1216,
"mean_token_accuracy": 0.7312948524951934,
"num_tokens": 56422176.0,
"step": 12610
},
{
"entropy": 1.0890625,
"epoch": 1.7247505808391417,
"grad_norm": 0.08523692302480838,
"learning_rate": 7.098069606876146e-07,
"loss": 1.0896,
"mean_token_accuracy": 0.7361468434333801,
"num_tokens": 57327436.0,
"step": 12620
},
{
"entropy": 1.115234375,
"epoch": 1.7261172611726119,
"grad_norm": 0.07578332719943,
"learning_rate": 7.062843454980979e-07,
"loss": 1.1158,
"mean_token_accuracy": 0.7313060879707336,
"num_tokens": 58262622.0,
"step": 12630
},
{
"entropy": 1.12421875,
"epoch": 1.7274839415060819,
"grad_norm": 0.06870240918616236,
"learning_rate": 7.027617303085811e-07,
"loss": 1.1235,
"mean_token_accuracy": 0.7301866590976716,
"num_tokens": 59229150.0,
"step": 12640
},
{
"entropy": 1.084375,
"epoch": 1.7288506218395518,
"grad_norm": 0.0717626669172362,
"learning_rate": 6.992391151190645e-07,
"loss": 1.0778,
"mean_token_accuracy": 0.738699209690094,
"num_tokens": 60166758.0,
"step": 12650
},
{
"entropy": 1.1546875,
"epoch": 1.7302173021730218,
"grad_norm": 0.06620131113474746,
"learning_rate": 6.957164999295478e-07,
"loss": 1.1589,
"mean_token_accuracy": 0.7244890868663788,
"num_tokens": 61116905.0,
"step": 12660
},
{
"entropy": 1.15390625,
"epoch": 1.7315839825064918,
"grad_norm": 0.07831428008081982,
"learning_rate": 6.921938847400309e-07,
"loss": 1.1475,
"mean_token_accuracy": 0.726482379436493,
"num_tokens": 62053868.0,
"step": 12670
},
{
"entropy": 1.096484375,
"epoch": 1.7329506628399618,
"grad_norm": 0.07305074254237197,
"learning_rate": 6.886712695505144e-07,
"loss": 1.0915,
"mean_token_accuracy": 0.7371149063110352,
"num_tokens": 62981293.0,
"step": 12680
},
{
"entropy": 1.083203125,
"epoch": 1.7343173431734318,
"grad_norm": 0.08021783356402858,
"learning_rate": 6.851486543609977e-07,
"loss": 1.0807,
"mean_token_accuracy": 0.7395688354969024,
"num_tokens": 63887500.0,
"step": 12690
},
{
"entropy": 1.146875,
"epoch": 1.7356840235069018,
"grad_norm": 0.07867911760170215,
"learning_rate": 6.816260391714811e-07,
"loss": 1.1543,
"mean_token_accuracy": 0.7237483203411103,
"num_tokens": 64850362.0,
"step": 12700
},
{
"entropy": 1.088671875,
"epoch": 1.7370507038403717,
"grad_norm": 0.07723036909024736,
"learning_rate": 6.781034239819642e-07,
"loss": 1.0953,
"mean_token_accuracy": 0.7372272253036499,
"num_tokens": 65777926.0,
"step": 12710
},
{
"entropy": 1.15625,
"epoch": 1.7384173841738417,
"grad_norm": 0.07446173336528605,
"learning_rate": 6.745808087924475e-07,
"loss": 1.1603,
"mean_token_accuracy": 0.7243685781955719,
"num_tokens": 66703323.0,
"step": 12720
},
{
"entropy": 1.130078125,
"epoch": 1.7397840645073117,
"grad_norm": 0.07574031230981015,
"learning_rate": 6.710581936029308e-07,
"loss": 1.1368,
"mean_token_accuracy": 0.7293879926204682,
"num_tokens": 67612471.0,
"step": 12730
},
{
"entropy": 1.1203125,
"epoch": 1.7411507448407817,
"grad_norm": 0.07744374179817322,
"learning_rate": 6.675355784134143e-07,
"loss": 1.1185,
"mean_token_accuracy": 0.7338252246379853,
"num_tokens": 68524718.0,
"step": 12740
},
{
"entropy": 1.123046875,
"epoch": 1.7425174251742517,
"grad_norm": 0.06734364523925297,
"learning_rate": 6.640129632238974e-07,
"loss": 1.1246,
"mean_token_accuracy": 0.7291808366775513,
"num_tokens": 69464905.0,
"step": 12750
},
{
"entropy": 1.11328125,
"epoch": 1.7438841055077217,
"grad_norm": 0.07803383536052724,
"learning_rate": 6.604903480343807e-07,
"loss": 1.1202,
"mean_token_accuracy": 0.7322011768817902,
"num_tokens": 70386359.0,
"step": 12760
},
{
"entropy": 1.13671875,
"epoch": 1.7452507858411916,
"grad_norm": 0.07577834782931114,
"learning_rate": 6.569677328448641e-07,
"loss": 1.1366,
"mean_token_accuracy": 0.7294918119907379,
"num_tokens": 71300383.0,
"step": 12770
},
{
"entropy": 1.13359375,
"epoch": 1.7466174661746616,
"grad_norm": 0.094127311621557,
"learning_rate": 6.534451176553473e-07,
"loss": 1.1489,
"mean_token_accuracy": 0.7265759408473969,
"num_tokens": 72189404.0,
"step": 12780
},
{
"entropy": 1.1296875,
"epoch": 1.7479841465081316,
"grad_norm": 0.06685520456952734,
"learning_rate": 6.499225024658306e-07,
"loss": 1.1439,
"mean_token_accuracy": 0.7267627060413361,
"num_tokens": 73134740.0,
"step": 12790
},
{
"entropy": 1.178125,
"epoch": 1.7493508268416016,
"grad_norm": 0.07100133895270179,
"learning_rate": 6.46399887276314e-07,
"loss": 1.1873,
"mean_token_accuracy": 0.7198917210102082,
"num_tokens": 74084065.0,
"step": 12800
},
{
"entropy": 1.071484375,
"epoch": 1.7507175071750718,
"grad_norm": 0.07516461842540706,
"learning_rate": 6.428772720867973e-07,
"loss": 1.0622,
"mean_token_accuracy": 0.7413422644138337,
"num_tokens": 75002128.0,
"step": 12810
},
{
"entropy": 1.15234375,
"epoch": 1.7520841875085418,
"grad_norm": 0.07568727622781783,
"learning_rate": 6.393546568972805e-07,
"loss": 1.1436,
"mean_token_accuracy": 0.7274211347103119,
"num_tokens": 75915915.0,
"step": 12820
},
{
"entropy": 1.12890625,
"epoch": 1.7534508678420118,
"grad_norm": 0.07708869768175645,
"learning_rate": 6.358320417077639e-07,
"loss": 1.1281,
"mean_token_accuracy": 0.731033718585968,
"num_tokens": 76866359.0,
"step": 12830
},
{
"entropy": 1.18125,
"epoch": 1.7548175481754817,
"grad_norm": 0.08121134965801352,
"learning_rate": 6.323094265182472e-07,
"loss": 1.1834,
"mean_token_accuracy": 0.7210062205791473,
"num_tokens": 77774995.0,
"step": 12840
},
{
"entropy": 1.146875,
"epoch": 1.7561842285089517,
"grad_norm": 0.07631545549125462,
"learning_rate": 6.287868113287306e-07,
"loss": 1.1492,
"mean_token_accuracy": 0.7282541215419769,
"num_tokens": 78694015.0,
"step": 12850
},
{
"entropy": 1.091796875,
"epoch": 1.7575509088424217,
"grad_norm": 0.07327018299796387,
"learning_rate": 6.252641961392138e-07,
"loss": 1.0868,
"mean_token_accuracy": 0.7383320987224579,
"num_tokens": 79606169.0,
"step": 12860
},
{
"entropy": 1.18203125,
"epoch": 1.758917589175892,
"grad_norm": 0.09446299449992138,
"learning_rate": 6.217415809496971e-07,
"loss": 1.1899,
"mean_token_accuracy": 0.7184771478176117,
"num_tokens": 80471663.0,
"step": 12870
},
{
"entropy": 1.151953125,
"epoch": 1.760284269509362,
"grad_norm": 0.08784439631053716,
"learning_rate": 6.182189657601804e-07,
"loss": 1.149,
"mean_token_accuracy": 0.7271250307559967,
"num_tokens": 81394566.0,
"step": 12880
},
{
"entropy": 1.13671875,
"epoch": 1.7616509498428319,
"grad_norm": 0.07871358373852338,
"learning_rate": 6.146963505706637e-07,
"loss": 1.1422,
"mean_token_accuracy": 0.7273901641368866,
"num_tokens": 82357354.0,
"step": 12890
},
{
"entropy": 1.09296875,
"epoch": 1.7630176301763019,
"grad_norm": 0.06677100594327787,
"learning_rate": 6.11173735381147e-07,
"loss": 1.0859,
"mean_token_accuracy": 0.7380281448364258,
"num_tokens": 83340201.0,
"step": 12900
},
{
"entropy": 1.1109375,
"epoch": 1.7643843105097718,
"grad_norm": 0.0803796724676498,
"learning_rate": 6.076511201916304e-07,
"loss": 1.1146,
"mean_token_accuracy": 0.7330704450607299,
"num_tokens": 84241511.0,
"step": 12910
},
{
"entropy": 1.1125,
"epoch": 1.7657509908432418,
"grad_norm": 0.07520474003309184,
"learning_rate": 6.041285050021136e-07,
"loss": 1.1164,
"mean_token_accuracy": 0.7305477738380433,
"num_tokens": 85197680.0,
"step": 12920
},
{
"entropy": 1.10859375,
"epoch": 1.7671176711767118,
"grad_norm": 0.07297290873592667,
"learning_rate": 6.006058898125969e-07,
"loss": 1.1043,
"mean_token_accuracy": 0.735247677564621,
"num_tokens": 86126379.0,
"step": 12930
},
{
"entropy": 1.20234375,
"epoch": 1.7684843515101818,
"grad_norm": 0.07327440460396896,
"learning_rate": 5.970832746230803e-07,
"loss": 1.2141,
"mean_token_accuracy": 0.7147914588451385,
"num_tokens": 87048573.0,
"step": 12940
},
{
"entropy": 1.115234375,
"epoch": 1.7698510318436518,
"grad_norm": 0.0791449506955321,
"learning_rate": 5.935606594335636e-07,
"loss": 1.1178,
"mean_token_accuracy": 0.7305992186069489,
"num_tokens": 87970166.0,
"step": 12950
},
{
"entropy": 1.1609375,
"epoch": 1.7712177121771218,
"grad_norm": 0.07822335269189397,
"learning_rate": 5.900380442440468e-07,
"loss": 1.1767,
"mean_token_accuracy": 0.7231527149677277,
"num_tokens": 88850407.0,
"step": 12960
},
{
"entropy": 1.079296875,
"epoch": 1.7725843925105917,
"grad_norm": 0.0742705871691746,
"learning_rate": 5.8651542905453e-07,
"loss": 1.0773,
"mean_token_accuracy": 0.7392765700817108,
"num_tokens": 89772787.0,
"step": 12970
},
{
"entropy": 1.15,
"epoch": 1.7739510728440617,
"grad_norm": 0.06970519460785432,
"learning_rate": 5.829928138650135e-07,
"loss": 1.1583,
"mean_token_accuracy": 0.7229465186595917,
"num_tokens": 90705435.0,
"step": 12980
},
{
"entropy": 1.14453125,
"epoch": 1.7753177531775317,
"grad_norm": 0.0787087209140183,
"learning_rate": 5.794701986754967e-07,
"loss": 1.1545,
"mean_token_accuracy": 0.726427561044693,
"num_tokens": 91635272.0,
"step": 12990
},
{
"entropy": 1.154296875,
"epoch": 1.7766844335110017,
"grad_norm": 0.17616185605499443,
"learning_rate": 5.759475834859801e-07,
"loss": 1.1523,
"mean_token_accuracy": 0.7246049880981446,
"num_tokens": 92556954.0,
"step": 13000
},
{
"entropy": 1.14609375,
"epoch": 1.7780511138444717,
"grad_norm": 0.07413509019676823,
"learning_rate": 5.724249682964633e-07,
"loss": 1.1566,
"mean_token_accuracy": 0.7246738791465759,
"num_tokens": 93506877.0,
"step": 13010
},
{
"entropy": 1.102734375,
"epoch": 1.7794177941779417,
"grad_norm": 0.07048905269003047,
"learning_rate": 5.689023531069466e-07,
"loss": 1.0934,
"mean_token_accuracy": 0.7374719738960266,
"num_tokens": 94415912.0,
"step": 13020
},
{
"entropy": 1.1390625,
"epoch": 1.7807844745114116,
"grad_norm": 0.0776137722104166,
"learning_rate": 5.6537973791743e-07,
"loss": 1.1315,
"mean_token_accuracy": 0.7288609743118286,
"num_tokens": 95348520.0,
"step": 13030
},
{
"entropy": 1.1390625,
"epoch": 1.7821511548448816,
"grad_norm": 0.07647951185297519,
"learning_rate": 5.618571227279133e-07,
"loss": 1.1412,
"mean_token_accuracy": 0.7271883010864257,
"num_tokens": 96243169.0,
"step": 13040
},
{
"entropy": 1.131640625,
"epoch": 1.7835178351783518,
"grad_norm": 0.07694042890825886,
"learning_rate": 5.583345075383965e-07,
"loss": 1.1446,
"mean_token_accuracy": 0.725974029302597,
"num_tokens": 97165460.0,
"step": 13050
},
{
"entropy": 1.046875,
"epoch": 1.7848845155118218,
"grad_norm": 0.06956179482735987,
"learning_rate": 5.548118923488799e-07,
"loss": 1.0387,
"mean_token_accuracy": 0.7467240393161774,
"num_tokens": 98065606.0,
"step": 13060
},
{
"entropy": 1.121875,
"epoch": 1.7862511958452918,
"grad_norm": 0.07096672618119589,
"learning_rate": 5.512892771593632e-07,
"loss": 1.1135,
"mean_token_accuracy": 0.7334041059017181,
"num_tokens": 98944079.0,
"step": 13070
},
{
"entropy": 1.13125,
"epoch": 1.7876178761787618,
"grad_norm": 0.0830683985089812,
"learning_rate": 5.477666619698464e-07,
"loss": 1.1329,
"mean_token_accuracy": 0.7295370519161224,
"num_tokens": 99834242.0,
"step": 13080
},
{
"entropy": 1.140625,
"epoch": 1.7889845565122318,
"grad_norm": 0.06916956638032118,
"learning_rate": 5.442440467803298e-07,
"loss": 1.1438,
"mean_token_accuracy": 0.7268279492855072,
"num_tokens": 100769863.0,
"step": 13090
},
{
"entropy": 1.11796875,
"epoch": 1.7903512368457017,
"grad_norm": 0.07722676371398704,
"learning_rate": 5.407214315908131e-07,
"loss": 1.1178,
"mean_token_accuracy": 0.7297780394554139,
"num_tokens": 101666867.0,
"step": 13100
},
{
"entropy": 1.134375,
"epoch": 1.791717917179172,
"grad_norm": 0.07006169933014485,
"learning_rate": 5.371988164012964e-07,
"loss": 1.1333,
"mean_token_accuracy": 0.7271759212017059,
"num_tokens": 102624418.0,
"step": 13110
},
{
"entropy": 1.117578125,
"epoch": 1.793084597512642,
"grad_norm": 0.07902494923822832,
"learning_rate": 5.336762012117797e-07,
"loss": 1.1117,
"mean_token_accuracy": 0.7333077847957611,
"num_tokens": 103539237.0,
"step": 13120
},
{
"entropy": 1.124609375,
"epoch": 1.794451277846112,
"grad_norm": 0.07621472540626262,
"learning_rate": 5.30153586022263e-07,
"loss": 1.139,
"mean_token_accuracy": 0.7288267850875855,
"num_tokens": 104445626.0,
"step": 13130
},
{
"entropy": 1.13046875,
"epoch": 1.795817958179582,
"grad_norm": 0.0722349863798373,
"learning_rate": 5.266309708327462e-07,
"loss": 1.1195,
"mean_token_accuracy": 0.7312707245349884,
"num_tokens": 105373855.0,
"step": 13140
},
{
"entropy": 1.11015625,
"epoch": 1.7971846385130519,
"grad_norm": 0.0788988377989891,
"learning_rate": 5.231083556432296e-07,
"loss": 1.1107,
"mean_token_accuracy": 0.7326897978782654,
"num_tokens": 106302873.0,
"step": 13150
},
{
"entropy": 1.0734375,
"epoch": 1.7985513188465219,
"grad_norm": 0.10510048845869266,
"learning_rate": 5.195857404537129e-07,
"loss": 1.0773,
"mean_token_accuracy": 0.7380777657032013,
"num_tokens": 107297035.0,
"step": 13160
},
{
"entropy": 1.120703125,
"epoch": 1.7999179991799918,
"grad_norm": 0.08033674551436286,
"learning_rate": 5.160631252641961e-07,
"loss": 1.1106,
"mean_token_accuracy": 0.7351796805858613,
"num_tokens": 108205581.0,
"step": 13170
},
{
"entropy": 1.109375,
"epoch": 1.8012846795134618,
"grad_norm": 0.0820411764285678,
"learning_rate": 5.125405100746795e-07,
"loss": 1.1107,
"mean_token_accuracy": 0.7331698000431061,
"num_tokens": 109079757.0,
"step": 13180
},
{
"entropy": 1.129296875,
"epoch": 1.8026513598469318,
"grad_norm": 0.1591555923271676,
"learning_rate": 5.090178948851628e-07,
"loss": 1.1317,
"mean_token_accuracy": 0.7289198100566864,
"num_tokens": 110030170.0,
"step": 13190
},
{
"entropy": 1.12578125,
"epoch": 1.8040180401804018,
"grad_norm": 0.07421268627454716,
"learning_rate": 5.054952796956461e-07,
"loss": 1.1297,
"mean_token_accuracy": 0.7305142879486084,
"num_tokens": 110939835.0,
"step": 13200
},
{
"entropy": 1.14140625,
"epoch": 1.8053847205138718,
"grad_norm": 0.07725432220723107,
"learning_rate": 5.019726645061294e-07,
"loss": 1.1416,
"mean_token_accuracy": 0.7280949771404266,
"num_tokens": 111883244.0,
"step": 13210
},
{
"entropy": 1.1640625,
"epoch": 1.8067514008473418,
"grad_norm": 0.07615422101520564,
"learning_rate": 4.984500493166127e-07,
"loss": 1.1631,
"mean_token_accuracy": 0.7231028914451599,
"num_tokens": 112802431.0,
"step": 13220
},
{
"entropy": 1.0828125,
"epoch": 1.8081180811808117,
"grad_norm": 0.07083322560703073,
"learning_rate": 4.94927434127096e-07,
"loss": 1.0783,
"mean_token_accuracy": 0.739861536026001,
"num_tokens": 113687834.0,
"step": 13230
},
{
"entropy": 1.15703125,
"epoch": 1.8094847615142817,
"grad_norm": 0.07162016784202692,
"learning_rate": 4.914048189375794e-07,
"loss": 1.1565,
"mean_token_accuracy": 0.7254205524921418,
"num_tokens": 114605083.0,
"step": 13240
},
{
"entropy": 1.087109375,
"epoch": 1.8108514418477517,
"grad_norm": 0.07671995726095397,
"learning_rate": 4.878822037480626e-07,
"loss": 1.0858,
"mean_token_accuracy": 0.7398945152759552,
"num_tokens": 115546216.0,
"step": 13250
},
{
"entropy": 1.11171875,
"epoch": 1.8122181221812217,
"grad_norm": 0.07662083873080588,
"learning_rate": 4.843595885585459e-07,
"loss": 1.12,
"mean_token_accuracy": 0.7310764729976654,
"num_tokens": 116484736.0,
"step": 13260
},
{
"entropy": 1.1640625,
"epoch": 1.8135848025146917,
"grad_norm": 0.08164827253278123,
"learning_rate": 4.808369733690292e-07,
"loss": 1.164,
"mean_token_accuracy": 0.7238347053527832,
"num_tokens": 117415166.0,
"step": 13270
},
{
"entropy": 1.12109375,
"epoch": 1.8149514828481617,
"grad_norm": 0.07353923982339816,
"learning_rate": 4.773143581795125e-07,
"loss": 1.1239,
"mean_token_accuracy": 0.7313941299915314,
"num_tokens": 118349848.0,
"step": 13280
},
{
"entropy": 1.20234375,
"epoch": 1.8163181631816319,
"grad_norm": 0.08217078527263527,
"learning_rate": 4.7379174298999583e-07,
"loss": 1.213,
"mean_token_accuracy": 0.7152225196361541,
"num_tokens": 119246330.0,
"step": 13290
},
{
"entropy": 1.09453125,
"epoch": 1.8176848435151018,
"grad_norm": 0.070938970670868,
"learning_rate": 4.702691278004791e-07,
"loss": 1.1049,
"mean_token_accuracy": 0.7350168883800506,
"num_tokens": 120181319.0,
"step": 13300
},
{
"entropy": 1.1515625,
"epoch": 1.8190515238485718,
"grad_norm": 0.06902803545765711,
"learning_rate": 4.6674651261096245e-07,
"loss": 1.1491,
"mean_token_accuracy": 0.7268678367137908,
"num_tokens": 121135502.0,
"step": 13310
},
{
"entropy": 1.146875,
"epoch": 1.8204182041820418,
"grad_norm": 0.07526663232652449,
"learning_rate": 4.632238974214457e-07,
"loss": 1.1606,
"mean_token_accuracy": 0.7233072936534881,
"num_tokens": 122041051.0,
"step": 13320
},
{
"entropy": 1.12890625,
"epoch": 1.8217848845155118,
"grad_norm": 0.07772176869473787,
"learning_rate": 4.5970128223192907e-07,
"loss": 1.129,
"mean_token_accuracy": 0.7289291441440582,
"num_tokens": 122940371.0,
"step": 13330
},
{
"entropy": 1.14453125,
"epoch": 1.8231515648489818,
"grad_norm": 0.07414346878732386,
"learning_rate": 4.561786670424123e-07,
"loss": 1.1355,
"mean_token_accuracy": 0.7298039734363556,
"num_tokens": 123832828.0,
"step": 13340
},
{
"entropy": 1.10546875,
"epoch": 1.824518245182452,
"grad_norm": 0.07908579494723159,
"learning_rate": 4.5265605185289563e-07,
"loss": 1.111,
"mean_token_accuracy": 0.7335039913654328,
"num_tokens": 124782029.0,
"step": 13350
},
{
"entropy": 1.10078125,
"epoch": 1.825884925515922,
"grad_norm": 0.06678742133865134,
"learning_rate": 4.491334366633789e-07,
"loss": 1.0882,
"mean_token_accuracy": 0.7369766891002655,
"num_tokens": 125695844.0,
"step": 13360
},
{
"entropy": 1.134375,
"epoch": 1.827251605849392,
"grad_norm": 0.07427834630855967,
"learning_rate": 4.456108214738622e-07,
"loss": 1.1389,
"mean_token_accuracy": 0.7284801483154297,
"num_tokens": 126602844.0,
"step": 13370
},
{
"entropy": 1.120703125,
"epoch": 1.828618286182862,
"grad_norm": 0.07022119725721813,
"learning_rate": 4.4208820628434553e-07,
"loss": 1.1236,
"mean_token_accuracy": 0.7315959870815277,
"num_tokens": 127496479.0,
"step": 13380
},
{
"entropy": 1.1421875,
"epoch": 1.829984966516332,
"grad_norm": 0.09233788425922608,
"learning_rate": 4.385655910948288e-07,
"loss": 1.1399,
"mean_token_accuracy": 0.7279922604560852,
"num_tokens": 128390571.0,
"step": 13390
},
{
"entropy": 1.1109375,
"epoch": 1.831351646849802,
"grad_norm": 0.07553754633993282,
"learning_rate": 4.3504297590531215e-07,
"loss": 1.1183,
"mean_token_accuracy": 0.7319070160388946,
"num_tokens": 129324966.0,
"step": 13400
},
{
"entropy": 1.146875,
"epoch": 1.8327183271832719,
"grad_norm": 0.07492089615272955,
"learning_rate": 4.3152036071579543e-07,
"loss": 1.1474,
"mean_token_accuracy": 0.726864892244339,
"num_tokens": 130223833.0,
"step": 13410
},
{
"entropy": 1.15390625,
"epoch": 1.8340850075167419,
"grad_norm": 0.06418645265541575,
"learning_rate": 4.2799774552627877e-07,
"loss": 1.1544,
"mean_token_accuracy": 0.7264314651489258,
"num_tokens": 131152117.0,
"step": 13420
},
{
"entropy": 1.12734375,
"epoch": 1.8354516878502118,
"grad_norm": 0.07431581376423141,
"learning_rate": 4.2447513033676205e-07,
"loss": 1.1213,
"mean_token_accuracy": 0.7318638563156128,
"num_tokens": 132065998.0,
"step": 13430
},
{
"entropy": 1.176171875,
"epoch": 1.8368183681836818,
"grad_norm": 0.08044971853514662,
"learning_rate": 4.209525151472454e-07,
"loss": 1.1723,
"mean_token_accuracy": 0.7214906871318817,
"num_tokens": 132958023.0,
"step": 13440
},
{
"entropy": 1.1375,
"epoch": 1.8381850485171518,
"grad_norm": 0.07278478821335976,
"learning_rate": 4.1742989995772867e-07,
"loss": 1.1274,
"mean_token_accuracy": 0.7293699622154236,
"num_tokens": 133876787.0,
"step": 13450
},
{
"entropy": 1.095703125,
"epoch": 1.8395517288506218,
"grad_norm": 0.07998513464377081,
"learning_rate": 4.13907284768212e-07,
"loss": 1.0874,
"mean_token_accuracy": 0.7359326779842377,
"num_tokens": 134801760.0,
"step": 13460
},
{
"entropy": 1.13203125,
"epoch": 1.8409184091840918,
"grad_norm": 0.07224911260251957,
"learning_rate": 4.1038466957869523e-07,
"loss": 1.1369,
"mean_token_accuracy": 0.7277138769626618,
"num_tokens": 135734222.0,
"step": 13470
},
{
"entropy": 1.11953125,
"epoch": 1.8422850895175618,
"grad_norm": 0.07162483086482553,
"learning_rate": 4.068620543891785e-07,
"loss": 1.1203,
"mean_token_accuracy": 0.7336102902889252,
"num_tokens": 136645982.0,
"step": 13480
},
{
"entropy": 1.112109375,
"epoch": 1.8436517698510317,
"grad_norm": 0.0709946128371802,
"learning_rate": 4.0333943919966185e-07,
"loss": 1.1164,
"mean_token_accuracy": 0.7317050158977508,
"num_tokens": 137542267.0,
"step": 13490
},
{
"entropy": 1.1296875,
"epoch": 1.8450184501845017,
"grad_norm": 0.07629521250357801,
"learning_rate": 3.9981682401014513e-07,
"loss": 1.1214,
"mean_token_accuracy": 0.7309034943580628,
"num_tokens": 138430757.0,
"step": 13500
},
{
"entropy": 1.14375,
"epoch": 1.8463851305179717,
"grad_norm": 0.06582196810754139,
"learning_rate": 3.9629420882062847e-07,
"loss": 1.1496,
"mean_token_accuracy": 0.7272295594215393,
"num_tokens": 139388862.0,
"step": 13510
},
{
"entropy": 1.15390625,
"epoch": 1.8477518108514417,
"grad_norm": 0.0786349952821788,
"learning_rate": 3.9277159363111175e-07,
"loss": 1.1682,
"mean_token_accuracy": 0.7227471709251404,
"num_tokens": 140236398.0,
"step": 13520
},
{
"entropy": 1.095703125,
"epoch": 1.849118491184912,
"grad_norm": 0.07551653853480134,
"learning_rate": 3.892489784415951e-07,
"loss": 1.0984,
"mean_token_accuracy": 0.7342126131057739,
"num_tokens": 141122220.0,
"step": 13530
},
{
"entropy": 1.1265625,
"epoch": 1.8504851715183819,
"grad_norm": 0.07584207307118075,
"learning_rate": 3.8572636325207837e-07,
"loss": 1.1236,
"mean_token_accuracy": 0.7300493896007538,
"num_tokens": 142051017.0,
"step": 13540
},
{
"entropy": 1.11875,
"epoch": 1.8518518518518519,
"grad_norm": 0.07558390291894601,
"learning_rate": 3.822037480625617e-07,
"loss": 1.1147,
"mean_token_accuracy": 0.7308057963848114,
"num_tokens": 142919158.0,
"step": 13550
},
{
"entropy": 1.111328125,
"epoch": 1.8532185321853218,
"grad_norm": 0.07585670208959044,
"learning_rate": 3.78681132873045e-07,
"loss": 1.1046,
"mean_token_accuracy": 0.7335990965366364,
"num_tokens": 143824740.0,
"step": 13560
},
{
"entropy": 1.0859375,
"epoch": 1.8545852125187918,
"grad_norm": 0.07493365564618405,
"learning_rate": 3.7515851768352827e-07,
"loss": 1.0788,
"mean_token_accuracy": 0.7408087849617004,
"num_tokens": 144768182.0,
"step": 13570
},
{
"entropy": 1.13671875,
"epoch": 1.8559518928522618,
"grad_norm": 0.07557279464371736,
"learning_rate": 3.716359024940116e-07,
"loss": 1.1441,
"mean_token_accuracy": 0.7260900378227234,
"num_tokens": 145700745.0,
"step": 13580
},
{
"entropy": 1.13359375,
"epoch": 1.857318573185732,
"grad_norm": 0.07689914898701833,
"learning_rate": 3.6811328730449484e-07,
"loss": 1.1336,
"mean_token_accuracy": 0.7282991647720337,
"num_tokens": 146654090.0,
"step": 13590
},
{
"entropy": 1.1296875,
"epoch": 1.858685253519202,
"grad_norm": 0.0710360010638184,
"learning_rate": 3.645906721149782e-07,
"loss": 1.133,
"mean_token_accuracy": 0.7306958198547363,
"num_tokens": 147575414.0,
"step": 13600
},
{
"entropy": 1.13359375,
"epoch": 1.860051933852672,
"grad_norm": 0.07137313595656082,
"learning_rate": 3.6106805692546145e-07,
"loss": 1.1353,
"mean_token_accuracy": 0.7293071627616883,
"num_tokens": 148555252.0,
"step": 13610
},
{
"entropy": 1.1375,
"epoch": 1.861418614186142,
"grad_norm": 0.07256924228734654,
"learning_rate": 3.575454417359448e-07,
"loss": 1.1353,
"mean_token_accuracy": 0.7296614050865173,
"num_tokens": 149469104.0,
"step": 13620
},
{
"entropy": 1.1078125,
"epoch": 1.862785294519612,
"grad_norm": 0.07419061574944906,
"learning_rate": 3.5402282654642807e-07,
"loss": 1.1159,
"mean_token_accuracy": 0.732065349817276,
"num_tokens": 150403775.0,
"step": 13630
},
{
"entropy": 1.18984375,
"epoch": 1.864151974853082,
"grad_norm": 0.07521887026863565,
"learning_rate": 3.505002113569114e-07,
"loss": 1.1847,
"mean_token_accuracy": 0.7195768833160401,
"num_tokens": 151299096.0,
"step": 13640
},
{
"entropy": 1.121875,
"epoch": 1.865518655186552,
"grad_norm": 0.07359191824026247,
"learning_rate": 3.469775961673947e-07,
"loss": 1.1076,
"mean_token_accuracy": 0.7361898958683014,
"num_tokens": 152219490.0,
"step": 13650
},
{
"entropy": 1.09140625,
"epoch": 1.866885335520022,
"grad_norm": 0.08129486226193587,
"learning_rate": 3.43454980977878e-07,
"loss": 1.084,
"mean_token_accuracy": 0.7382171809673309,
"num_tokens": 153167398.0,
"step": 13660
},
{
"entropy": 1.13125,
"epoch": 1.8682520158534919,
"grad_norm": 0.07367072993599204,
"learning_rate": 3.399323657883613e-07,
"loss": 1.1231,
"mean_token_accuracy": 0.7307034134864807,
"num_tokens": 154073257.0,
"step": 13670
},
{
"entropy": 1.10625,
"epoch": 1.8696186961869619,
"grad_norm": 0.08717531410812251,
"learning_rate": 3.364097505988446e-07,
"loss": 1.103,
"mean_token_accuracy": 0.7338844358921051,
"num_tokens": 155032425.0,
"step": 13680
},
{
"entropy": 1.14765625,
"epoch": 1.8709853765204318,
"grad_norm": 0.07428613791378101,
"learning_rate": 3.328871354093279e-07,
"loss": 1.1454,
"mean_token_accuracy": 0.7267822623252869,
"num_tokens": 155938174.0,
"step": 13690
},
{
"entropy": 1.1140625,
"epoch": 1.8723520568539018,
"grad_norm": 0.06868344300995262,
"learning_rate": 3.293645202198112e-07,
"loss": 1.1062,
"mean_token_accuracy": 0.7346673369407654,
"num_tokens": 156871113.0,
"step": 13700
},
{
"entropy": 1.15078125,
"epoch": 1.8737187371873718,
"grad_norm": 0.07606479205198037,
"learning_rate": 3.2584190503029454e-07,
"loss": 1.1534,
"mean_token_accuracy": 0.7261383950710296,
"num_tokens": 157781512.0,
"step": 13710
},
{
"entropy": 1.105859375,
"epoch": 1.8750854175208418,
"grad_norm": 0.08386006391577279,
"learning_rate": 3.223192898407778e-07,
"loss": 1.1003,
"mean_token_accuracy": 0.7344890296459198,
"num_tokens": 158691360.0,
"step": 13720
},
{
"entropy": 1.08359375,
"epoch": 1.8764520978543118,
"grad_norm": 0.07520975805069129,
"learning_rate": 3.1879667465126116e-07,
"loss": 1.074,
"mean_token_accuracy": 0.7399497330188751,
"num_tokens": 159582543.0,
"step": 13730
},
{
"entropy": 1.1625,
"epoch": 1.8778187781877818,
"grad_norm": 0.08066196966154683,
"learning_rate": 3.152740594617444e-07,
"loss": 1.1817,
"mean_token_accuracy": 0.7202288568019867,
"num_tokens": 160558445.0,
"step": 13740
},
{
"entropy": 1.1171875,
"epoch": 1.8791854585212517,
"grad_norm": 0.08040030549778736,
"learning_rate": 3.117514442722277e-07,
"loss": 1.1024,
"mean_token_accuracy": 0.7343210577964783,
"num_tokens": 161423229.0,
"step": 13750
},
{
"entropy": 1.146875,
"epoch": 1.8805521388547217,
"grad_norm": 0.071484773216832,
"learning_rate": 3.08228829082711e-07,
"loss": 1.1471,
"mean_token_accuracy": 0.7277787506580353,
"num_tokens": 162367954.0,
"step": 13760
},
{
"entropy": 1.1375,
"epoch": 1.881918819188192,
"grad_norm": 0.07317287269493526,
"learning_rate": 3.0470621389319434e-07,
"loss": 1.1396,
"mean_token_accuracy": 0.726705151796341,
"num_tokens": 163255158.0,
"step": 13770
},
{
"entropy": 1.1265625,
"epoch": 1.883285499521662,
"grad_norm": 0.07826489311922706,
"learning_rate": 3.0118359870367763e-07,
"loss": 1.1106,
"mean_token_accuracy": 0.734573370218277,
"num_tokens": 164147726.0,
"step": 13780
},
{
"entropy": 1.078515625,
"epoch": 1.884652179855132,
"grad_norm": 0.07163235364506787,
"learning_rate": 2.9766098351416096e-07,
"loss": 1.0712,
"mean_token_accuracy": 0.7418512046337128,
"num_tokens": 165051383.0,
"step": 13790
},
{
"entropy": 1.145703125,
"epoch": 1.8860188601886019,
"grad_norm": 0.07499365372118767,
"learning_rate": 2.9413836832464424e-07,
"loss": 1.1312,
"mean_token_accuracy": 0.7280918836593628,
"num_tokens": 165994504.0,
"step": 13800
},
{
"entropy": 1.14375,
"epoch": 1.8873855405220719,
"grad_norm": 0.07606782210795555,
"learning_rate": 2.906157531351276e-07,
"loss": 1.1481,
"mean_token_accuracy": 0.7259750306606293,
"num_tokens": 166947789.0,
"step": 13810
},
{
"entropy": 1.1015625,
"epoch": 1.8887522208555418,
"grad_norm": 0.08343497521203319,
"learning_rate": 2.8709313794561086e-07,
"loss": 1.0962,
"mean_token_accuracy": 0.7361396133899689,
"num_tokens": 167854357.0,
"step": 13820
},
{
"entropy": 1.099609375,
"epoch": 1.890118901189012,
"grad_norm": 0.07731226549690087,
"learning_rate": 2.8357052275609415e-07,
"loss": 1.0981,
"mean_token_accuracy": 0.7333524227142334,
"num_tokens": 168751355.0,
"step": 13830
},
{
"entropy": 1.18046875,
"epoch": 1.891485581522482,
"grad_norm": 0.08115885929459135,
"learning_rate": 2.8004790756657743e-07,
"loss": 1.1685,
"mean_token_accuracy": 0.721680474281311,
"num_tokens": 169638746.0,
"step": 13840
},
{
"entropy": 1.14375,
"epoch": 1.892852261855952,
"grad_norm": 0.07893388257379971,
"learning_rate": 2.7652529237706076e-07,
"loss": 1.1426,
"mean_token_accuracy": 0.7263674914836884,
"num_tokens": 170598271.0,
"step": 13850
},
{
"entropy": 1.140625,
"epoch": 1.894218942189422,
"grad_norm": 0.07143505638985197,
"learning_rate": 2.7300267718754405e-07,
"loss": 1.1559,
"mean_token_accuracy": 0.7249770641326905,
"num_tokens": 171484862.0,
"step": 13860
},
{
"entropy": 1.1109375,
"epoch": 1.895585622522892,
"grad_norm": 0.06838153952793802,
"learning_rate": 2.694800619980274e-07,
"loss": 1.112,
"mean_token_accuracy": 0.7343595564365387,
"num_tokens": 172409180.0,
"step": 13870
},
{
"entropy": 1.0984375,
"epoch": 1.896952302856362,
"grad_norm": 0.073509802142004,
"learning_rate": 2.6595744680851066e-07,
"loss": 1.1098,
"mean_token_accuracy": 0.7333821773529052,
"num_tokens": 173310985.0,
"step": 13880
},
{
"entropy": 1.092578125,
"epoch": 1.898318983189832,
"grad_norm": 0.07623618039203878,
"learning_rate": 2.6243483161899395e-07,
"loss": 1.0841,
"mean_token_accuracy": 0.7371468424797059,
"num_tokens": 174254723.0,
"step": 13890
},
{
"entropy": 1.116015625,
"epoch": 1.899685663523302,
"grad_norm": 0.06565888691151385,
"learning_rate": 2.589122164294773e-07,
"loss": 1.1177,
"mean_token_accuracy": 0.7318998396396637,
"num_tokens": 175165940.0,
"step": 13900
},
{
"entropy": 1.15859375,
"epoch": 1.901052343856772,
"grad_norm": 0.0830915870992727,
"learning_rate": 2.5538960123996056e-07,
"loss": 1.1647,
"mean_token_accuracy": 0.7234176099300385,
"num_tokens": 176057007.0,
"step": 13910
},
{
"entropy": 1.148046875,
"epoch": 1.902419024190242,
"grad_norm": 0.06769553677459843,
"learning_rate": 2.518669860504439e-07,
"loss": 1.1572,
"mean_token_accuracy": 0.7260869801044464,
"num_tokens": 177011947.0,
"step": 13920
},
{
"entropy": 1.06484375,
"epoch": 1.9037857045237119,
"grad_norm": 0.0688777643887345,
"learning_rate": 2.483443708609272e-07,
"loss": 1.0569,
"mean_token_accuracy": 0.7414270460605621,
"num_tokens": 177924400.0,
"step": 13930
},
{
"entropy": 1.1109375,
"epoch": 1.9051523848571819,
"grad_norm": 0.0764686930259745,
"learning_rate": 2.4482175567141046e-07,
"loss": 1.1029,
"mean_token_accuracy": 0.7344212114810944,
"num_tokens": 178845001.0,
"step": 13940
},
{
"entropy": 1.103125,
"epoch": 1.9065190651906518,
"grad_norm": 0.07038869543271793,
"learning_rate": 2.4129914048189375e-07,
"loss": 1.1034,
"mean_token_accuracy": 0.7345105648040772,
"num_tokens": 179748324.0,
"step": 13950
},
{
"entropy": 1.1875,
"epoch": 1.9078857455241218,
"grad_norm": 0.07317764134005893,
"learning_rate": 2.3777652529237708e-07,
"loss": 1.188,
"mean_token_accuracy": 0.7192046761512756,
"num_tokens": 180662022.0,
"step": 13960
},
{
"entropy": 1.140625,
"epoch": 1.9092524258575918,
"grad_norm": 0.07749813798011279,
"learning_rate": 2.342539101028604e-07,
"loss": 1.1379,
"mean_token_accuracy": 0.7283846199512481,
"num_tokens": 181563942.0,
"step": 13970
},
{
"entropy": 1.1203125,
"epoch": 1.9106191061910618,
"grad_norm": 0.07515259023833258,
"learning_rate": 2.307312949133437e-07,
"loss": 1.1196,
"mean_token_accuracy": 0.7313680708408355,
"num_tokens": 182500356.0,
"step": 13980
},
{
"entropy": 1.120703125,
"epoch": 1.9119857865245318,
"grad_norm": 0.08429477669052571,
"learning_rate": 2.2720867972382698e-07,
"loss": 1.1244,
"mean_token_accuracy": 0.7298611521720886,
"num_tokens": 183445124.0,
"step": 13990
},
{
"entropy": 1.1265625,
"epoch": 1.9133524668580018,
"grad_norm": 0.06874130064369395,
"learning_rate": 2.236860645343103e-07,
"loss": 1.1165,
"mean_token_accuracy": 0.7325605511665344,
"num_tokens": 184396746.0,
"step": 14000
},
{
"entropy": 1.1359375,
"epoch": 1.914719147191472,
"grad_norm": 0.07044950453765171,
"learning_rate": 2.201634493447936e-07,
"loss": 1.1372,
"mean_token_accuracy": 0.7284309506416321,
"num_tokens": 185359281.0,
"step": 14010
},
{
"entropy": 1.1375,
"epoch": 1.916085827524942,
"grad_norm": 0.06986028875453026,
"learning_rate": 2.166408341552769e-07,
"loss": 1.1465,
"mean_token_accuracy": 0.7264468848705292,
"num_tokens": 186280196.0,
"step": 14020
},
{
"entropy": 1.1453125,
"epoch": 1.917452507858412,
"grad_norm": 0.07183053642998603,
"learning_rate": 2.131182189657602e-07,
"loss": 1.1631,
"mean_token_accuracy": 0.72248415350914,
"num_tokens": 187232100.0,
"step": 14030
},
{
"entropy": 1.119921875,
"epoch": 1.918819188191882,
"grad_norm": 0.07133054908276189,
"learning_rate": 2.095956037762435e-07,
"loss": 1.1185,
"mean_token_accuracy": 0.7333917677402496,
"num_tokens": 188154418.0,
"step": 14040
},
{
"entropy": 1.11171875,
"epoch": 1.920185868525352,
"grad_norm": 0.0732575365268055,
"learning_rate": 2.0607298858672678e-07,
"loss": 1.1137,
"mean_token_accuracy": 0.7339317739009857,
"num_tokens": 189090001.0,
"step": 14050
},
{
"entropy": 1.14375,
"epoch": 1.9215525488588219,
"grad_norm": 0.08399589592316369,
"learning_rate": 2.025503733972101e-07,
"loss": 1.1392,
"mean_token_accuracy": 0.7278505384922027,
"num_tokens": 190014866.0,
"step": 14060
},
{
"entropy": 1.14765625,
"epoch": 1.922919229192292,
"grad_norm": 0.07509979106821754,
"learning_rate": 1.990277582076934e-07,
"loss": 1.1446,
"mean_token_accuracy": 0.7268008768558503,
"num_tokens": 190917483.0,
"step": 14070
},
{
"entropy": 1.10078125,
"epoch": 1.924285909525762,
"grad_norm": 0.0715097192118739,
"learning_rate": 1.955051430181767e-07,
"loss": 1.0905,
"mean_token_accuracy": 0.7365378201007843,
"num_tokens": 191865929.0,
"step": 14080
},
{
"entropy": 1.153125,
"epoch": 1.925652589859232,
"grad_norm": 0.07703489886596376,
"learning_rate": 1.9198252782866002e-07,
"loss": 1.1614,
"mean_token_accuracy": 0.725530743598938,
"num_tokens": 192813435.0,
"step": 14090
},
{
"entropy": 1.124609375,
"epoch": 1.927019270192702,
"grad_norm": 0.07955610515594624,
"learning_rate": 1.8845991263914333e-07,
"loss": 1.131,
"mean_token_accuracy": 0.7302420377731323,
"num_tokens": 193726480.0,
"step": 14100
},
{
"entropy": 1.1359375,
"epoch": 1.928385950526172,
"grad_norm": 0.06913975394659269,
"learning_rate": 1.8493729744962664e-07,
"loss": 1.1378,
"mean_token_accuracy": 0.7286489546298981,
"num_tokens": 194707283.0,
"step": 14110
},
{
"entropy": 1.08515625,
"epoch": 1.929752630859642,
"grad_norm": 0.0703430847898186,
"learning_rate": 1.8141468226010995e-07,
"loss": 1.0813,
"mean_token_accuracy": 0.737841272354126,
"num_tokens": 195656150.0,
"step": 14120
},
{
"entropy": 1.10078125,
"epoch": 1.931119311193112,
"grad_norm": 0.07081629253506438,
"learning_rate": 1.778920670705932e-07,
"loss": 1.0931,
"mean_token_accuracy": 0.7355447232723236,
"num_tokens": 196550109.0,
"step": 14130
},
{
"entropy": 1.138671875,
"epoch": 1.932485991526582,
"grad_norm": 0.08125461975601851,
"learning_rate": 1.743694518810765e-07,
"loss": 1.1312,
"mean_token_accuracy": 0.729481703042984,
"num_tokens": 197476705.0,
"step": 14140
},
{
"entropy": 1.12265625,
"epoch": 1.933852671860052,
"grad_norm": 0.08049673794100225,
"learning_rate": 1.7084683669155982e-07,
"loss": 1.125,
"mean_token_accuracy": 0.7327687501907348,
"num_tokens": 198405164.0,
"step": 14150
},
{
"entropy": 1.14765625,
"epoch": 1.935219352193522,
"grad_norm": 0.08537236442516634,
"learning_rate": 1.6732422150204313e-07,
"loss": 1.1426,
"mean_token_accuracy": 0.7266921877861023,
"num_tokens": 199281622.0,
"step": 14160
},
{
"entropy": 1.137890625,
"epoch": 1.936586032526992,
"grad_norm": 0.07612096286222589,
"learning_rate": 1.6380160631252644e-07,
"loss": 1.1263,
"mean_token_accuracy": 0.7318998813629151,
"num_tokens": 200213447.0,
"step": 14170
},
{
"entropy": 1.15859375,
"epoch": 1.937952712860462,
"grad_norm": 0.06796127884328593,
"learning_rate": 1.6027899112300975e-07,
"loss": 1.141,
"mean_token_accuracy": 0.7270290791988373,
"num_tokens": 201132847.0,
"step": 14180
},
{
"entropy": 1.15234375,
"epoch": 1.9393193931939319,
"grad_norm": 0.07682838072078434,
"learning_rate": 1.5675637593349303e-07,
"loss": 1.1662,
"mean_token_accuracy": 0.7242652595043182,
"num_tokens": 202041330.0,
"step": 14190
},
{
"entropy": 1.13984375,
"epoch": 1.9406860735274019,
"grad_norm": 0.08067477287862707,
"learning_rate": 1.5323376074397634e-07,
"loss": 1.1505,
"mean_token_accuracy": 0.7245282888412475,
"num_tokens": 202927230.0,
"step": 14200
},
{
"entropy": 1.157421875,
"epoch": 1.9420527538608718,
"grad_norm": 0.07776899133148292,
"learning_rate": 1.4971114555445965e-07,
"loss": 1.176,
"mean_token_accuracy": 0.722199147939682,
"num_tokens": 203852110.0,
"step": 14210
},
{
"entropy": 1.103125,
"epoch": 1.9434194341943418,
"grad_norm": 0.06669296641385988,
"learning_rate": 1.4618853036494293e-07,
"loss": 1.0999,
"mean_token_accuracy": 0.7358080983161926,
"num_tokens": 204754873.0,
"step": 14220
},
{
"entropy": 1.16171875,
"epoch": 1.9447861145278118,
"grad_norm": 0.07264504928375268,
"learning_rate": 1.4266591517542624e-07,
"loss": 1.1681,
"mean_token_accuracy": 0.723861300945282,
"num_tokens": 205689965.0,
"step": 14230
},
{
"entropy": 1.1203125,
"epoch": 1.9461527948612818,
"grad_norm": 0.07328374586873804,
"learning_rate": 1.3914329998590955e-07,
"loss": 1.1203,
"mean_token_accuracy": 0.7319039106369019,
"num_tokens": 206650172.0,
"step": 14240
},
{
"entropy": 1.1515625,
"epoch": 1.947519475194752,
"grad_norm": 0.07159155882504922,
"learning_rate": 1.3562068479639286e-07,
"loss": 1.154,
"mean_token_accuracy": 0.7241881132125855,
"num_tokens": 207623109.0,
"step": 14250
},
{
"entropy": 1.14609375,
"epoch": 1.948886155528222,
"grad_norm": 0.065061224277872,
"learning_rate": 1.3209806960687614e-07,
"loss": 1.1456,
"mean_token_accuracy": 0.7267702043056488,
"num_tokens": 208577518.0,
"step": 14260
},
{
"entropy": 1.11796875,
"epoch": 1.950252835861692,
"grad_norm": 0.07608499287360124,
"learning_rate": 1.2857545441735945e-07,
"loss": 1.1301,
"mean_token_accuracy": 0.7286983072757721,
"num_tokens": 209533088.0,
"step": 14270
},
{
"entropy": 1.123046875,
"epoch": 1.951619516195162,
"grad_norm": 0.07075088905011669,
"learning_rate": 1.2505283922784276e-07,
"loss": 1.1247,
"mean_token_accuracy": 0.7306671798229217,
"num_tokens": 210472496.0,
"step": 14280
},
{
"entropy": 1.14140625,
"epoch": 1.952986196528632,
"grad_norm": 0.07344417157598446,
"learning_rate": 1.2153022403832607e-07,
"loss": 1.1369,
"mean_token_accuracy": 0.7289599597454071,
"num_tokens": 211347026.0,
"step": 14290
},
{
"entropy": 1.1140625,
"epoch": 1.954352876862102,
"grad_norm": 0.07825601495149453,
"learning_rate": 1.1800760884880937e-07,
"loss": 1.1257,
"mean_token_accuracy": 0.7295693159103394,
"num_tokens": 212245046.0,
"step": 14300
},
{
"entropy": 1.19609375,
"epoch": 1.9557195571955721,
"grad_norm": 0.08489708783530638,
"learning_rate": 1.1448499365929266e-07,
"loss": 1.2069,
"mean_token_accuracy": 0.7174946963787079,
"num_tokens": 213166602.0,
"step": 14310
},
{
"entropy": 1.1390625,
"epoch": 1.957086237529042,
"grad_norm": 0.07528852442691654,
"learning_rate": 1.1096237846977597e-07,
"loss": 1.1408,
"mean_token_accuracy": 0.7264128446578979,
"num_tokens": 214061122.0,
"step": 14320
},
{
"entropy": 1.128125,
"epoch": 1.958452917862512,
"grad_norm": 0.07232324326185426,
"learning_rate": 1.0743976328025928e-07,
"loss": 1.1259,
"mean_token_accuracy": 0.7319540560245514,
"num_tokens": 215007170.0,
"step": 14330
},
{
"entropy": 1.1453125,
"epoch": 1.959819598195982,
"grad_norm": 0.07271070941696796,
"learning_rate": 1.0391714809074258e-07,
"loss": 1.1512,
"mean_token_accuracy": 0.7270254790782928,
"num_tokens": 215916034.0,
"step": 14340
},
{
"entropy": 1.14921875,
"epoch": 1.961186278529452,
"grad_norm": 0.0701076422137375,
"learning_rate": 1.0039453290122588e-07,
"loss": 1.1528,
"mean_token_accuracy": 0.726936548948288,
"num_tokens": 216926531.0,
"step": 14350
},
{
"entropy": 1.153125,
"epoch": 1.962552958862922,
"grad_norm": 0.07888264488704483,
"learning_rate": 9.687191771170918e-08,
"loss": 1.168,
"mean_token_accuracy": 0.7223072111606598,
"num_tokens": 217853362.0,
"step": 14360
},
{
"entropy": 1.10546875,
"epoch": 1.963919639196392,
"grad_norm": 0.08152132341058567,
"learning_rate": 9.334930252219248e-08,
"loss": 1.1075,
"mean_token_accuracy": 0.7325962126255036,
"num_tokens": 218743465.0,
"step": 14370
},
{
"entropy": 1.1390625,
"epoch": 1.965286319529862,
"grad_norm": 0.07429846110345174,
"learning_rate": 8.982668733267578e-08,
"loss": 1.1257,
"mean_token_accuracy": 0.7281958818435669,
"num_tokens": 219649104.0,
"step": 14380
},
{
"entropy": 1.132421875,
"epoch": 1.966652999863332,
"grad_norm": 0.08010412838529529,
"learning_rate": 8.630407214315909e-08,
"loss": 1.1308,
"mean_token_accuracy": 0.7297826707363129,
"num_tokens": 220584028.0,
"step": 14390
},
{
"entropy": 1.1578125,
"epoch": 1.968019680196802,
"grad_norm": 0.06837412457370663,
"learning_rate": 8.27814569536424e-08,
"loss": 1.162,
"mean_token_accuracy": 0.7230808973312378,
"num_tokens": 221478507.0,
"step": 14400
},
{
"entropy": 1.146875,
"epoch": 1.969386360530272,
"grad_norm": 0.08153359909560123,
"learning_rate": 7.925884176412568e-08,
"loss": 1.1485,
"mean_token_accuracy": 0.7263343751430511,
"num_tokens": 222372101.0,
"step": 14410
},
{
"entropy": 1.12421875,
"epoch": 1.970753040863742,
"grad_norm": 0.07452392395396402,
"learning_rate": 7.573622657460899e-08,
"loss": 1.1293,
"mean_token_accuracy": 0.7323010504245758,
"num_tokens": 223299776.0,
"step": 14420
},
{
"entropy": 1.13984375,
"epoch": 1.972119721197212,
"grad_norm": 0.06542785382849328,
"learning_rate": 7.22136113850923e-08,
"loss": 1.1405,
"mean_token_accuracy": 0.7280698418617249,
"num_tokens": 224268136.0,
"step": 14430
},
{
"entropy": 1.13984375,
"epoch": 1.9734864015306819,
"grad_norm": 0.07121575712028956,
"learning_rate": 6.869099619557561e-08,
"loss": 1.1254,
"mean_token_accuracy": 0.73137948513031,
"num_tokens": 225212103.0,
"step": 14440
},
{
"entropy": 1.0953125,
"epoch": 1.9748530818641519,
"grad_norm": 0.07297115826409822,
"learning_rate": 6.51683810060589e-08,
"loss": 1.0921,
"mean_token_accuracy": 0.7361871302127838,
"num_tokens": 226179853.0,
"step": 14450
},
{
"entropy": 1.1296875,
"epoch": 1.9762197621976219,
"grad_norm": 0.07511042854260608,
"learning_rate": 6.164576581654221e-08,
"loss": 1.1235,
"mean_token_accuracy": 0.7303396761417389,
"num_tokens": 227077445.0,
"step": 14460
},
{
"entropy": 1.1421875,
"epoch": 1.9775864425310918,
"grad_norm": 0.0725298192604222,
"learning_rate": 5.8123150627025515e-08,
"loss": 1.1567,
"mean_token_accuracy": 0.7260671079158783,
"num_tokens": 227957010.0,
"step": 14470
},
{
"entropy": 1.153125,
"epoch": 1.9789531228645618,
"grad_norm": 0.06698134716670712,
"learning_rate": 5.460053543750881e-08,
"loss": 1.164,
"mean_token_accuracy": 0.7238839626312256,
"num_tokens": 228898359.0,
"step": 14480
},
{
"entropy": 1.07890625,
"epoch": 1.980319803198032,
"grad_norm": 0.0735384917891218,
"learning_rate": 5.107792024799211e-08,
"loss": 1.0838,
"mean_token_accuracy": 0.7381073951721191,
"num_tokens": 229796201.0,
"step": 14490
},
{
"entropy": 1.103125,
"epoch": 1.981686483531502,
"grad_norm": 0.06922903684507521,
"learning_rate": 4.7555305058475415e-08,
"loss": 1.1055,
"mean_token_accuracy": 0.7351616561412812,
"num_tokens": 230723005.0,
"step": 14500
},
{
"entropy": 1.1265625,
"epoch": 1.983053163864972,
"grad_norm": 0.07362621919118525,
"learning_rate": 4.403268986895872e-08,
"loss": 1.1241,
"mean_token_accuracy": 0.7303666710853577,
"num_tokens": 231614723.0,
"step": 14510
},
{
"entropy": 1.1609375,
"epoch": 1.984419844198442,
"grad_norm": 0.07732676026325312,
"learning_rate": 4.0510074679442026e-08,
"loss": 1.1587,
"mean_token_accuracy": 0.723632425069809,
"num_tokens": 232521569.0,
"step": 14520
},
{
"entropy": 1.15234375,
"epoch": 1.985786524531912,
"grad_norm": 0.0724194835417431,
"learning_rate": 3.698745948992532e-08,
"loss": 1.1539,
"mean_token_accuracy": 0.7254241943359375,
"num_tokens": 233451981.0,
"step": 14530
},
{
"entropy": 1.1359375,
"epoch": 1.987153204865382,
"grad_norm": 0.08053059666224277,
"learning_rate": 3.3464844300408624e-08,
"loss": 1.1443,
"mean_token_accuracy": 0.727936738729477,
"num_tokens": 234349422.0,
"step": 14540
},
{
"entropy": 1.1515625,
"epoch": 1.9885198851988521,
"grad_norm": 0.07838695742102439,
"learning_rate": 2.994222911089193e-08,
"loss": 1.1549,
"mean_token_accuracy": 0.7236394882202148,
"num_tokens": 235273744.0,
"step": 14550
},
{
"entropy": 1.1265625,
"epoch": 1.9898865655323221,
"grad_norm": 0.0742156330832039,
"learning_rate": 2.6419613921375232e-08,
"loss": 1.1362,
"mean_token_accuracy": 0.7293105900287629,
"num_tokens": 236223471.0,
"step": 14560
},
{
"entropy": 1.1515625,
"epoch": 1.991253245865792,
"grad_norm": 0.07370596853695563,
"learning_rate": 2.289699873185853e-08,
"loss": 1.1464,
"mean_token_accuracy": 0.7264924705028534,
"num_tokens": 237094812.0,
"step": 14570
},
{
"entropy": 1.105859375,
"epoch": 1.992619926199262,
"grad_norm": 0.07371837323810171,
"learning_rate": 1.9374383542341837e-08,
"loss": 1.1127,
"mean_token_accuracy": 0.7327429234981537,
"num_tokens": 238004049.0,
"step": 14580
},
{
"entropy": 1.118359375,
"epoch": 1.993986606532732,
"grad_norm": 0.07536256556000094,
"learning_rate": 1.585176835282514e-08,
"loss": 1.1168,
"mean_token_accuracy": 0.73252734541893,
"num_tokens": 238890267.0,
"step": 14590
},
{
"entropy": 1.1546875,
"epoch": 1.995353286866202,
"grad_norm": 0.06977575781002278,
"learning_rate": 1.2329153163308442e-08,
"loss": 1.1563,
"mean_token_accuracy": 0.7240585446357727,
"num_tokens": 239811788.0,
"step": 14600
},
{
"entropy": 1.095703125,
"epoch": 1.996719967199672,
"grad_norm": 0.06461937191554327,
"learning_rate": 8.806537973791744e-09,
"loss": 1.0867,
"mean_token_accuracy": 0.7370800852775574,
"num_tokens": 240716356.0,
"step": 14610
},
{
"entropy": 1.13359375,
"epoch": 1.998086647533142,
"grad_norm": 0.08060313905234695,
"learning_rate": 5.2839227842750465e-09,
"loss": 1.1483,
"mean_token_accuracy": 0.724536520242691,
"num_tokens": 241601601.0,
"step": 14620
},
{
"entropy": 1.175,
"epoch": 1.999453327866612,
"grad_norm": 0.07367983797646162,
"learning_rate": 1.7613075947583486e-09,
"loss": 1.1738,
"mean_token_accuracy": 0.7212517201900482,
"num_tokens": 242552770.0,
"step": 14630
},
{
"entropy": 1.130859375,
"epoch": 2.0,
"mean_token_accuracy": 0.7274957001209259,
"num_tokens": 242935223.0,
"step": 14634,
"total_flos": 2.19812273324032e+16,
"train_loss": 0.2031160936633208,
"train_runtime": 9356.8224,
"train_samples_per_second": 200.177,
"train_steps_per_second": 1.564
}
],
"logging_steps": 10,
"max_steps": 14634,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.19812273324032e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}