{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 14634,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 1.953125,
      "epoch": 0.0013666803334700013,
      "grad_norm": 1.156074662903807,
      "learning_rate": 1.0227272727272728e-07,
      "loss": 2.2738,
      "mean_token_accuracy": 0.5747356593608857,
      "num_tokens": 937214.0,
      "step": 10
    },
    {
      "entropy": 1.86953125,
      "epoch": 0.0027333606669400026,
      "grad_norm": 1.1415761540611056,
      "learning_rate": 2.1590909090909094e-07,
      "loss": 2.1592,
      "mean_token_accuracy": 0.5934585154056549,
      "num_tokens": 1832763.0,
      "step": 20
    },
    {
      "entropy": 1.89765625,
      "epoch": 0.004100041000410004,
      "grad_norm": 1.2091395927654087,
      "learning_rate": 3.2954545454545455e-07,
      "loss": 2.2051,
      "mean_token_accuracy": 0.585878336429596,
      "num_tokens": 2759199.0,
      "step": 30
    },
    {
      "entropy": 1.91015625,
      "epoch": 0.005466721333880005,
      "grad_norm": 1.1673440399030521,
      "learning_rate": 4.431818181818182e-07,
      "loss": 2.2024,
      "mean_token_accuracy": 0.5851876974105835,
      "num_tokens": 3670561.0,
      "step": 40
    },
    {
      "entropy": 1.83203125,
      "epoch": 0.006833401667350007,
      "grad_norm": 1.5423114495382766,
      "learning_rate": 5.568181818181818e-07,
      "loss": 2.121,
      "mean_token_accuracy": 0.5960965514183044,
      "num_tokens": 4603768.0,
      "step": 50
    },
    {
      "entropy": 1.9296875,
      "epoch": 0.008200082000820008,
      "grad_norm": 1.8686974484251,
      "learning_rate": 6.704545454545456e-07,
      "loss": 2.218,
      "mean_token_accuracy": 0.5839235782623291,
      "num_tokens": 5567048.0,
      "step": 60
    },
    {
      "entropy": 1.94609375,
      "epoch": 0.00956676233429001,
      "grad_norm": 1.0753041199658435,
      "learning_rate": 7.840909090909092e-07,
      "loss": 2.2562,
      "mean_token_accuracy": 0.5783323407173157,
      "num_tokens": 6535603.0,
      "step": 70
    },
    {
      "entropy": 1.96171875,
      "epoch": 0.01093344266776001,
      "grad_norm": 1.1444322902509063,
      "learning_rate": 8.977272727272728e-07,
      "loss": 2.2663,
      "mean_token_accuracy": 0.5751403868198395,
      "num_tokens": 7476241.0,
      "step": 80
    },
    {
      "entropy": 1.925,
      "epoch": 0.012300123001230012,
      "grad_norm": 1.1123363381256088,
      "learning_rate": 1.0113636363636365e-06,
      "loss": 2.2119,
      "mean_token_accuracy": 0.5831176042556763,
      "num_tokens": 8346244.0,
      "step": 90
    },
    {
      "entropy": 1.86328125,
      "epoch": 0.013666803334700014,
      "grad_norm": 1.4073068572684413,
      "learning_rate": 1.125e-06,
      "loss": 2.1696,
      "mean_token_accuracy": 0.591303139925003,
      "num_tokens": 9275008.0,
      "step": 100
    },
    {
      "entropy": 1.88203125,
      "epoch": 0.015033483668170014,
      "grad_norm": 0.9809638754325662,
      "learning_rate": 1.2386363636363638e-06,
      "loss": 2.1507,
      "mean_token_accuracy": 0.5913166284561158,
      "num_tokens": 10215452.0,
      "step": 110
    },
    {
      "entropy": 1.8671875,
      "epoch": 0.016400164001640016,
      "grad_norm": 1.437343829693697,
      "learning_rate": 1.3522727272727273e-06,
      "loss": 2.1556,
      "mean_token_accuracy": 0.589393800497055,
      "num_tokens": 11118084.0,
      "step": 120
    },
    {
      "entropy": 1.90234375,
      "epoch": 0.017766844335110017,
      "grad_norm": 1.556848343860049,
      "learning_rate": 1.465909090909091e-06,
      "loss": 2.1795,
      "mean_token_accuracy": 0.5893899947404861,
      "num_tokens": 12052838.0,
      "step": 130
    },
    {
      "entropy": 1.91015625,
      "epoch": 0.01913352466858002,
      "grad_norm": 1.2063323830036279,
      "learning_rate": 1.5795454545454547e-06,
      "loss": 2.1826,
      "mean_token_accuracy": 0.5876123070716858,
      "num_tokens": 12995411.0,
      "step": 140
    },
    {
      "entropy": 1.88984375,
      "epoch": 0.02050020500205002,
      "grad_norm": 1.3050725378761945,
      "learning_rate": 1.6931818181818182e-06,
      "loss": 2.163,
      "mean_token_accuracy": 0.5915959715843201,
      "num_tokens": 13945408.0,
      "step": 150
    },
    {
      "entropy": 1.925,
      "epoch": 0.02186688533552002,
      "grad_norm": 1.1881512510698542,
      "learning_rate": 1.8068181818181822e-06,
      "loss": 2.1825,
      "mean_token_accuracy": 0.58701451420784,
      "num_tokens": 14893066.0,
      "step": 160
    },
    {
      "entropy": 1.92109375,
      "epoch": 0.023233565668990024,
      "grad_norm": 1.0269270514405306,
      "learning_rate": 1.9204545454545457e-06,
      "loss": 2.1786,
      "mean_token_accuracy": 0.5874285280704499,
      "num_tokens": 15751849.0,
      "step": 170
    },
    {
      "entropy": 1.90234375,
      "epoch": 0.024600246002460024,
      "grad_norm": 1.3637455738801143,
      "learning_rate": 2.034090909090909e-06,
      "loss": 2.1371,
      "mean_token_accuracy": 0.5907127439975739,
      "num_tokens": 16663740.0,
      "step": 180
    },
    {
      "entropy": 2.00234375,
      "epoch": 0.025966926335930025,
      "grad_norm": 0.7936147504618679,
      "learning_rate": 2.147727272727273e-06,
      "loss": 2.2286,
      "mean_token_accuracy": 0.5772878587245941,
      "num_tokens": 17618999.0,
      "step": 190
    },
    {
      "entropy": 1.928125,
      "epoch": 0.02733360666940003,
      "grad_norm": 0.7915373253283765,
      "learning_rate": 2.2613636363636366e-06,
      "loss": 2.182,
      "mean_token_accuracy": 0.5837074041366577,
      "num_tokens": 18482611.0,
      "step": 200
    },
{
|
|
"entropy": 1.89765625,
|
|
"epoch": 0.02870028700287003,
|
|
"grad_norm": 0.9395882500060874,
|
|
"learning_rate": 2.375e-06,
|
|
"loss": 2.1298,
|
|
"mean_token_accuracy": 0.5863896548748017,
|
|
"num_tokens": 19421505.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 1.92734375,
|
|
"epoch": 0.03006696733634003,
|
|
"grad_norm": 1.316700429051317,
|
|
"learning_rate": 2.488636363636364e-06,
|
|
"loss": 2.1214,
|
|
"mean_token_accuracy": 0.5915327668190002,
|
|
"num_tokens": 20362099.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 1.925,
|
|
"epoch": 0.03143364766981003,
|
|
"grad_norm": 0.8849106527296239,
|
|
"learning_rate": 2.6022727272727276e-06,
|
|
"loss": 2.1112,
|
|
"mean_token_accuracy": 0.5968846201896667,
|
|
"num_tokens": 21308408.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 1.9359375,
|
|
"epoch": 0.03280032800328003,
|
|
"grad_norm": 0.5673965460602998,
|
|
"learning_rate": 2.715909090909091e-06,
|
|
"loss": 2.1128,
|
|
"mean_token_accuracy": 0.5928859353065491,
|
|
"num_tokens": 22242562.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 1.95703125,
|
|
"epoch": 0.034167008336750036,
|
|
"grad_norm": 0.6774755617216568,
|
|
"learning_rate": 2.829545454545455e-06,
|
|
"loss": 2.1275,
|
|
"mean_token_accuracy": 0.5959916770458221,
|
|
"num_tokens": 23172299.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 1.9234375,
|
|
"epoch": 0.03553368867022003,
|
|
"grad_norm": 0.44492331880006164,
|
|
"learning_rate": 2.9431818181818185e-06,
|
|
"loss": 2.0806,
|
|
"mean_token_accuracy": 0.5997147262096405,
|
|
"num_tokens": 24105064.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 1.94140625,
|
|
"epoch": 0.03690036900369004,
|
|
"grad_norm": 0.6636533977768899,
|
|
"learning_rate": 3.056818181818182e-06,
|
|
"loss": 2.1079,
|
|
"mean_token_accuracy": 0.5958877563476562,
|
|
"num_tokens": 25035956.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 1.8921875,
|
|
"epoch": 0.03826704933716004,
|
|
"grad_norm": 1.4842253986652878,
|
|
"learning_rate": 3.1704545454545456e-06,
|
|
"loss": 2.0191,
|
|
"mean_token_accuracy": 0.6067247748374939,
|
|
"num_tokens": 25984258.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 1.92734375,
|
|
"epoch": 0.03963372967063004,
|
|
"grad_norm": 0.592639745815027,
|
|
"learning_rate": 3.2840909090909095e-06,
|
|
"loss": 2.0435,
|
|
"mean_token_accuracy": 0.6026173412799836,
|
|
"num_tokens": 26909845.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 1.9359375,
|
|
"epoch": 0.04100041000410004,
|
|
"grad_norm": 0.9617421335333151,
|
|
"learning_rate": 3.397727272727273e-06,
|
|
"loss": 2.077,
|
|
"mean_token_accuracy": 0.5966192781925201,
|
|
"num_tokens": 27832457.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 1.9890625,
|
|
"epoch": 0.042367090337570044,
|
|
"grad_norm": 1.0691537490209024,
|
|
"learning_rate": 3.5113636363636365e-06,
|
|
"loss": 2.1203,
|
|
"mean_token_accuracy": 0.5937976837158203,
|
|
"num_tokens": 28776617.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 1.96171875,
|
|
"epoch": 0.04373377067104004,
|
|
"grad_norm": 0.5622782093685751,
|
|
"learning_rate": 3.625e-06,
|
|
"loss": 2.0672,
|
|
"mean_token_accuracy": 0.5980514347553253,
|
|
"num_tokens": 29677253.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 2.02734375,
|
|
"epoch": 0.045100451004510045,
|
|
"grad_norm": 0.5546874472199492,
|
|
"learning_rate": 3.7386363636363635e-06,
|
|
"loss": 2.1326,
|
|
"mean_token_accuracy": 0.5920709252357483,
|
|
"num_tokens": 30592153.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 1.890625,
|
|
"epoch": 0.04646713133798005,
|
|
"grad_norm": 0.39140264754283716,
|
|
"learning_rate": 3.852272727272728e-06,
|
|
"loss": 1.9564,
|
|
"mean_token_accuracy": 0.6156416475772858,
|
|
"num_tokens": 31485625.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 1.8703125,
|
|
"epoch": 0.047833811671450045,
|
|
"grad_norm": 0.38409298172170403,
|
|
"learning_rate": 3.965909090909091e-06,
|
|
"loss": 1.931,
|
|
"mean_token_accuracy": 0.6187814712524414,
|
|
"num_tokens": 32433055.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 1.97578125,
|
|
"epoch": 0.04920049200492005,
|
|
"grad_norm": 0.44903467099204053,
|
|
"learning_rate": 4.079545454545455e-06,
|
|
"loss": 2.0545,
|
|
"mean_token_accuracy": 0.6000491321086884,
|
|
"num_tokens": 33370104.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 2.01015625,
|
|
"epoch": 0.05056717233839005,
|
|
"grad_norm": 1.3054045410944195,
|
|
"learning_rate": 4.193181818181819e-06,
|
|
"loss": 2.0752,
|
|
"mean_token_accuracy": 0.5999207854270935,
|
|
"num_tokens": 34301472.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 1.9859375,
|
|
"epoch": 0.05193385267186005,
|
|
"grad_norm": 0.5570279395450879,
|
|
"learning_rate": 4.306818181818182e-06,
|
|
"loss": 2.0573,
|
|
"mean_token_accuracy": 0.6014457881450653,
|
|
"num_tokens": 35207017.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 1.9265625,
|
|
"epoch": 0.05330053300533005,
|
|
"grad_norm": 0.47537207583750685,
|
|
"learning_rate": 4.420454545454546e-06,
|
|
"loss": 2.0093,
|
|
"mean_token_accuracy": 0.6086175203323364,
|
|
"num_tokens": 36167373.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 1.93203125,
|
|
"epoch": 0.05466721333880006,
|
|
"grad_norm": 0.2639566154682048,
|
|
"learning_rate": 4.53409090909091e-06,
|
|
"loss": 1.9745,
|
|
"mean_token_accuracy": 0.6148153007030487,
|
|
"num_tokens": 37057216.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 2.00546875,
|
|
"epoch": 0.05603389367227005,
|
|
"grad_norm": 0.40666228995985193,
|
|
"learning_rate": 4.647727272727273e-06,
|
|
"loss": 2.0551,
|
|
"mean_token_accuracy": 0.5984691202640533,
|
|
"num_tokens": 37960481.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 1.9703125,
|
|
"epoch": 0.05740057400574006,
|
|
"grad_norm": 0.3314237318555606,
|
|
"learning_rate": 4.761363636363637e-06,
|
|
"loss": 2.0381,
|
|
"mean_token_accuracy": 0.6050574481487274,
|
|
"num_tokens": 38906408.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 1.9359375,
|
|
"epoch": 0.05876725433921006,
|
|
"grad_norm": 0.4730505613218524,
|
|
"learning_rate": 4.875e-06,
|
|
"loss": 2.0002,
|
|
"mean_token_accuracy": 0.6102830052375794,
|
|
"num_tokens": 39848900.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 1.98515625,
|
|
"epoch": 0.06013393467268006,
|
|
"grad_norm": 0.3064285286308058,
|
|
"learning_rate": 4.988636363636364e-06,
|
|
"loss": 2.0453,
|
|
"mean_token_accuracy": 0.6044155418872833,
|
|
"num_tokens": 40804818.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 1.984375,
|
|
"epoch": 0.06150061500615006,
|
|
"grad_norm": 0.3304147800969379,
|
|
"learning_rate": 4.996829646329435e-06,
|
|
"loss": 2.0402,
|
|
"mean_token_accuracy": 0.6029874622821808,
|
|
"num_tokens": 41729062.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 1.9984375,
|
|
"epoch": 0.06286729533962006,
|
|
"grad_norm": 0.44212615152953016,
|
|
"learning_rate": 4.993307031139919e-06,
|
|
"loss": 2.0534,
|
|
"mean_token_accuracy": 0.6026692688465118,
|
|
"num_tokens": 42678992.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 1.97734375,
|
|
"epoch": 0.06423397567309007,
|
|
"grad_norm": 0.30866684028425856,
|
|
"learning_rate": 4.989784415950402e-06,
|
|
"loss": 2.0359,
|
|
"mean_token_accuracy": 0.6029385566711426,
|
|
"num_tokens": 43616340.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 2.07578125,
|
|
"epoch": 0.06560065600656007,
|
|
"grad_norm": 0.6643904522395835,
|
|
"learning_rate": 4.986261800760885e-06,
|
|
"loss": 2.1226,
|
|
"mean_token_accuracy": 0.5928886950016021,
|
|
"num_tokens": 44537225.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 1.9375,
|
|
"epoch": 0.06696733634003006,
|
|
"grad_norm": 0.3917462960500531,
|
|
"learning_rate": 4.9827391855713685e-06,
|
|
"loss": 1.9752,
|
|
"mean_token_accuracy": 0.6154713153839111,
|
|
"num_tokens": 45493989.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 1.953125,
|
|
"epoch": 0.06833401667350007,
|
|
"grad_norm": 0.6964684129171099,
|
|
"learning_rate": 4.979216570381852e-06,
|
|
"loss": 1.9693,
|
|
"mean_token_accuracy": 0.6102234065532685,
|
|
"num_tokens": 46418925.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 1.92109375,
|
|
"epoch": 0.06970069700697007,
|
|
"grad_norm": 0.3751909742184756,
|
|
"learning_rate": 4.975693955192335e-06,
|
|
"loss": 1.9679,
|
|
"mean_token_accuracy": 0.6087932586669922,
|
|
"num_tokens": 47332538.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 1.9625,
|
|
"epoch": 0.07106737734044007,
|
|
"grad_norm": 1.1145315382560543,
|
|
"learning_rate": 4.972171340002819e-06,
|
|
"loss": 1.9932,
|
|
"mean_token_accuracy": 0.6109839856624604,
|
|
"num_tokens": 48296164.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 1.9703125,
|
|
"epoch": 0.07243405767391008,
|
|
"grad_norm": 0.3241650987298432,
|
|
"learning_rate": 4.968648724813302e-06,
|
|
"loss": 1.9984,
|
|
"mean_token_accuracy": 0.6087324261665344,
|
|
"num_tokens": 49218895.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 1.89609375,
|
|
"epoch": 0.07380073800738007,
|
|
"grad_norm": 0.29169672651752254,
|
|
"learning_rate": 4.965126109623785e-06,
|
|
"loss": 1.9253,
|
|
"mean_token_accuracy": 0.6168661057949066,
|
|
"num_tokens": 50125282.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 1.96484375,
|
|
"epoch": 0.07516741834085007,
|
|
"grad_norm": 0.343214457363143,
|
|
"learning_rate": 4.961603494434268e-06,
|
|
"loss": 2.0148,
|
|
"mean_token_accuracy": 0.6083645164966583,
|
|
"num_tokens": 51051082.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 1.92890625,
|
|
"epoch": 0.07653409867432008,
|
|
"grad_norm": 0.3728200711781265,
|
|
"learning_rate": 4.958080879244752e-06,
|
|
"loss": 1.9568,
|
|
"mean_token_accuracy": 0.6143473744392395,
|
|
"num_tokens": 51997548.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 1.9234375,
|
|
"epoch": 0.07790077900779008,
|
|
"grad_norm": 0.5110596854931215,
|
|
"learning_rate": 4.954558264055234e-06,
|
|
"loss": 1.9509,
|
|
"mean_token_accuracy": 0.6153412580490112,
|
|
"num_tokens": 52882165.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 1.91796875,
|
|
"epoch": 0.07926745934126007,
|
|
"grad_norm": 0.8878858677063508,
|
|
"learning_rate": 4.951035648865719e-06,
|
|
"loss": 1.9286,
|
|
"mean_token_accuracy": 0.6202750265598297,
|
|
"num_tokens": 53779009.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 1.9109375,
|
|
"epoch": 0.08063413967473008,
|
|
"grad_norm": 0.644516541230191,
|
|
"learning_rate": 4.9475130336762015e-06,
|
|
"loss": 1.9388,
|
|
"mean_token_accuracy": 0.614998584985733,
|
|
"num_tokens": 54682802.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 1.98046875,
|
|
"epoch": 0.08200082000820008,
|
|
"grad_norm": 0.424575271637112,
|
|
"learning_rate": 4.943990418486685e-06,
|
|
"loss": 2.0095,
|
|
"mean_token_accuracy": 0.604308408498764,
|
|
"num_tokens": 55647295.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 1.92421875,
|
|
"epoch": 0.08336750034167008,
|
|
"grad_norm": 0.713483616518217,
|
|
"learning_rate": 4.9404678032971685e-06,
|
|
"loss": 1.9422,
|
|
"mean_token_accuracy": 0.6195296823978425,
|
|
"num_tokens": 56553238.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 1.8921875,
|
|
"epoch": 0.08473418067514009,
|
|
"grad_norm": 0.263177250626298,
|
|
"learning_rate": 4.936945188107651e-06,
|
|
"loss": 1.9186,
|
|
"mean_token_accuracy": 0.6223742008209229,
|
|
"num_tokens": 57497017.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 1.8203125,
|
|
"epoch": 0.08610086100861009,
|
|
"grad_norm": 0.5706600815092372,
|
|
"learning_rate": 4.933422572918135e-06,
|
|
"loss": 1.8307,
|
|
"mean_token_accuracy": 0.6339426577091217,
|
|
"num_tokens": 58398571.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 1.9015625,
|
|
"epoch": 0.08746754134208008,
|
|
"grad_norm": 0.5584420083829682,
|
|
"learning_rate": 4.929899957728618e-06,
|
|
"loss": 1.9226,
|
|
"mean_token_accuracy": 0.623866057395935,
|
|
"num_tokens": 59340910.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 1.9375,
|
|
"epoch": 0.08883422167555009,
|
|
"grad_norm": 0.3240942783311434,
|
|
"learning_rate": 4.926377342539102e-06,
|
|
"loss": 1.9383,
|
|
"mean_token_accuracy": 0.6189294815063476,
|
|
"num_tokens": 60257936.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 1.94453125,
|
|
"epoch": 0.09020090200902009,
|
|
"grad_norm": 0.4477252760297667,
|
|
"learning_rate": 4.922854727349585e-06,
|
|
"loss": 1.9585,
|
|
"mean_token_accuracy": 0.6141092300415039,
|
|
"num_tokens": 61186900.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 1.89609375,
|
|
"epoch": 0.09156758234249009,
|
|
"grad_norm": 0.2728408374005631,
|
|
"learning_rate": 4.919332112160068e-06,
|
|
"loss": 1.9093,
|
|
"mean_token_accuracy": 0.6209330260753632,
|
|
"num_tokens": 62144469.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 1.90546875,
|
|
"epoch": 0.0929342626759601,
|
|
"grad_norm": 0.3348021122762971,
|
|
"learning_rate": 4.915809496970551e-06,
|
|
"loss": 1.9078,
|
|
"mean_token_accuracy": 0.6204870104789734,
|
|
"num_tokens": 63095739.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 2.0015625,
|
|
"epoch": 0.0943009430094301,
|
|
"grad_norm": 0.40912903508425785,
|
|
"learning_rate": 4.912286881781035e-06,
|
|
"loss": 2.025,
|
|
"mean_token_accuracy": 0.6031952500343323,
|
|
"num_tokens": 64009291.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 1.89296875,
|
|
"epoch": 0.09566762334290009,
|
|
"grad_norm": 0.4501592299870401,
|
|
"learning_rate": 4.908764266591518e-06,
|
|
"loss": 1.8958,
|
|
"mean_token_accuracy": 0.6233786225318909,
|
|
"num_tokens": 64920620.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 1.89609375,
|
|
"epoch": 0.0970343036763701,
|
|
"grad_norm": 0.3588133807994301,
|
|
"learning_rate": 4.9052416514020015e-06,
|
|
"loss": 1.9001,
|
|
"mean_token_accuracy": 0.6225775361061097,
|
|
"num_tokens": 65831822.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 1.8671875,
|
|
"epoch": 0.0984009840098401,
|
|
"grad_norm": 0.3617744611180895,
|
|
"learning_rate": 4.901719036212484e-06,
|
|
"loss": 1.8888,
|
|
"mean_token_accuracy": 0.6239147305488586,
|
|
"num_tokens": 66798367.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 1.8484375,
|
|
"epoch": 0.0997676643433101,
|
|
"grad_norm": 0.3247286993144485,
|
|
"learning_rate": 4.898196421022968e-06,
|
|
"loss": 1.8658,
|
|
"mean_token_accuracy": 0.6280933737754821,
|
|
"num_tokens": 67730120.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 1.86875,
|
|
"epoch": 0.1011343446767801,
|
|
"grad_norm": 0.36875298171998644,
|
|
"learning_rate": 4.894673805833451e-06,
|
|
"loss": 1.8545,
|
|
"mean_token_accuracy": 0.6295511543750762,
|
|
"num_tokens": 68583554.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 1.92265625,
|
|
"epoch": 0.1025010250102501,
|
|
"grad_norm": 0.32575842322297305,
|
|
"learning_rate": 4.891151190643935e-06,
|
|
"loss": 1.9425,
|
|
"mean_token_accuracy": 0.6198048830032349,
|
|
"num_tokens": 69508375.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 1.8984375,
|
|
"epoch": 0.1038677053437201,
|
|
"grad_norm": 0.24871717700209986,
|
|
"learning_rate": 4.8876285754544175e-06,
|
|
"loss": 1.9054,
|
|
"mean_token_accuracy": 0.6221800804138183,
|
|
"num_tokens": 70460877.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 1.93828125,
|
|
"epoch": 0.10523438567719011,
|
|
"grad_norm": 0.2950440161289729,
|
|
"learning_rate": 4.884105960264901e-06,
|
|
"loss": 1.9603,
|
|
"mean_token_accuracy": 0.6135677635669708,
|
|
"num_tokens": 71360713.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 1.8359375,
|
|
"epoch": 0.1066010660106601,
|
|
"grad_norm": 0.27166968275342157,
|
|
"learning_rate": 4.880583345075385e-06,
|
|
"loss": 1.8387,
|
|
"mean_token_accuracy": 0.6299247324466706,
|
|
"num_tokens": 72284468.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 1.859375,
|
|
"epoch": 0.1079677463441301,
|
|
"grad_norm": 0.4379715739940116,
|
|
"learning_rate": 4.877060729885867e-06,
|
|
"loss": 1.8645,
|
|
"mean_token_accuracy": 0.626331228017807,
|
|
"num_tokens": 73207403.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 1.86875,
|
|
"epoch": 0.10933442667760011,
|
|
"grad_norm": 0.2739335762739454,
|
|
"learning_rate": 4.873538114696351e-06,
|
|
"loss": 1.8822,
|
|
"mean_token_accuracy": 0.6237033188343049,
|
|
"num_tokens": 74117008.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 1.83984375,
|
|
"epoch": 0.11070110701107011,
|
|
"grad_norm": 0.2106027507379127,
|
|
"learning_rate": 4.870015499506834e-06,
|
|
"loss": 1.8576,
|
|
"mean_token_accuracy": 0.6330011665821076,
|
|
"num_tokens": 75041835.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 1.90078125,
|
|
"epoch": 0.1120677873445401,
|
|
"grad_norm": 0.2419522628112473,
|
|
"learning_rate": 4.866492884317318e-06,
|
|
"loss": 1.9149,
|
|
"mean_token_accuracy": 0.6196782231330872,
|
|
"num_tokens": 75967197.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 1.84921875,
|
|
"epoch": 0.11343446767801012,
|
|
"grad_norm": 0.24849284410831587,
|
|
"learning_rate": 4.862970269127801e-06,
|
|
"loss": 1.851,
|
|
"mean_token_accuracy": 0.6320783793926239,
|
|
"num_tokens": 76877706.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 1.91015625,
|
|
"epoch": 0.11480114801148011,
|
|
"grad_norm": 0.21765213858530014,
|
|
"learning_rate": 4.859447653938284e-06,
|
|
"loss": 1.885,
|
|
"mean_token_accuracy": 0.6225167155265808,
|
|
"num_tokens": 77772764.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 1.93359375,
|
|
"epoch": 0.11616782834495011,
|
|
"grad_norm": 0.33131706878958495,
|
|
"learning_rate": 4.855925038748768e-06,
|
|
"loss": 1.9539,
|
|
"mean_token_accuracy": 0.6162404716014862,
|
|
"num_tokens": 78765823.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 1.90859375,
|
|
"epoch": 0.11753450867842012,
|
|
"grad_norm": 0.5774736629969314,
|
|
"learning_rate": 4.852402423559251e-06,
|
|
"loss": 1.9284,
|
|
"mean_token_accuracy": 0.6201462745666504,
|
|
"num_tokens": 79731424.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 1.9140625,
|
|
"epoch": 0.11890118901189012,
|
|
"grad_norm": 0.19665935166157775,
|
|
"learning_rate": 4.848879808369734e-06,
|
|
"loss": 1.9431,
|
|
"mean_token_accuracy": 0.6187787115573883,
|
|
"num_tokens": 80641448.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 1.8359375,
|
|
"epoch": 0.12026786934536011,
|
|
"grad_norm": 0.1954070223875274,
|
|
"learning_rate": 4.8453571931802175e-06,
|
|
"loss": 1.8237,
|
|
"mean_token_accuracy": 0.6337621748447418,
|
|
"num_tokens": 81585576.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 1.796875,
|
|
"epoch": 0.12163454967883013,
|
|
"grad_norm": 0.2527569339636289,
|
|
"learning_rate": 4.8418345779907e-06,
|
|
"loss": 1.8012,
|
|
"mean_token_accuracy": 0.6382579624652862,
|
|
"num_tokens": 82486429.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 1.9296875,
|
|
"epoch": 0.12300123001230012,
|
|
"grad_norm": 0.5445479550976874,
|
|
"learning_rate": 4.838311962801184e-06,
|
|
"loss": 1.9461,
|
|
"mean_token_accuracy": 0.616455215215683,
|
|
"num_tokens": 83433731.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 1.86796875,
|
|
"epoch": 0.12436791034577012,
|
|
"grad_norm": 0.45639343364122437,
|
|
"learning_rate": 4.834789347611667e-06,
|
|
"loss": 1.8727,
|
|
"mean_token_accuracy": 0.6262564241886139,
|
|
"num_tokens": 84309955.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 1.9265625,
|
|
"epoch": 0.12573459067924012,
|
|
"grad_norm": 0.21638329386780378,
|
|
"learning_rate": 4.831266732422151e-06,
|
|
"loss": 1.922,
|
|
"mean_token_accuracy": 0.6205068528652191,
|
|
"num_tokens": 85189827.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 1.7796875,
|
|
"epoch": 0.12710127101271013,
|
|
"grad_norm": 0.35127272548379235,
|
|
"learning_rate": 4.827744117232634e-06,
|
|
"loss": 1.774,
|
|
"mean_token_accuracy": 0.642277467250824,
|
|
"num_tokens": 86090998.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 1.78125,
|
|
"epoch": 0.12846795134618014,
|
|
"grad_norm": 0.25465105169774294,
|
|
"learning_rate": 4.824221502043117e-06,
|
|
"loss": 1.7789,
|
|
"mean_token_accuracy": 0.6374447882175446,
|
|
"num_tokens": 86994470.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 1.91015625,
|
|
"epoch": 0.12983463167965012,
|
|
"grad_norm": 0.2949622080582681,
|
|
"learning_rate": 4.820698886853601e-06,
|
|
"loss": 1.9014,
|
|
"mean_token_accuracy": 0.6184860646724701,
|
|
"num_tokens": 87975439.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 1.896875,
|
|
"epoch": 0.13120131201312013,
|
|
"grad_norm": 0.41755342885799007,
|
|
"learning_rate": 4.817176271664084e-06,
|
|
"loss": 1.905,
|
|
"mean_token_accuracy": 0.6179324686527252,
|
|
"num_tokens": 88914958.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 1.8234375,
|
|
"epoch": 0.13256799234659014,
|
|
"grad_norm": 0.30524059328211045,
|
|
"learning_rate": 4.813653656474567e-06,
|
|
"loss": 1.826,
|
|
"mean_token_accuracy": 0.6316073417663575,
|
|
"num_tokens": 89793788.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 1.8984375,
|
|
"epoch": 0.13393467268006012,
|
|
"grad_norm": 0.2552485197911862,
|
|
"learning_rate": 4.8101310412850505e-06,
|
|
"loss": 1.9016,
|
|
"mean_token_accuracy": 0.6190746843814849,
|
|
"num_tokens": 90705091.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 1.87421875,
|
|
"epoch": 0.13530135301353013,
|
|
"grad_norm": 0.6080989089623805,
|
|
"learning_rate": 4.806608426095534e-06,
|
|
"loss": 1.894,
|
|
"mean_token_accuracy": 0.6250736117362976,
|
|
"num_tokens": 91668098.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 1.828125,
|
|
"epoch": 0.13666803334700015,
|
|
"grad_norm": 0.3791540405115751,
|
|
"learning_rate": 4.803085810906017e-06,
|
|
"loss": 1.8293,
|
|
"mean_token_accuracy": 0.6317654013633728,
|
|
"num_tokens": 92592501.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 1.91015625,
|
|
"epoch": 0.13803471368047013,
|
|
"grad_norm": 0.23213773101297513,
|
|
"learning_rate": 4.7995631957165e-06,
|
|
"loss": 1.9111,
|
|
"mean_token_accuracy": 0.6172502279281616,
|
|
"num_tokens": 93544394.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 1.81484375,
|
|
"epoch": 0.13940139401394014,
|
|
"grad_norm": 0.3200893901943456,
|
|
"learning_rate": 4.796040580526984e-06,
|
|
"loss": 1.8165,
|
|
"mean_token_accuracy": 0.6365236341953278,
|
|
"num_tokens": 94471176.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 1.8421875,
|
|
"epoch": 0.14076807434741015,
|
|
"grad_norm": 0.41355775584103477,
|
|
"learning_rate": 4.792517965337467e-06,
|
|
"loss": 1.8567,
|
|
"mean_token_accuracy": 0.6299303054809571,
|
|
"num_tokens": 95355220.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 1.86875,
|
|
"epoch": 0.14213475468088013,
|
|
"grad_norm": 0.3640567772240355,
|
|
"learning_rate": 4.78899535014795e-06,
|
|
"loss": 1.8838,
|
|
"mean_token_accuracy": 0.6263083994388581,
|
|
"num_tokens": 96280654.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 1.84609375,
|
|
"epoch": 0.14350143501435014,
|
|
"grad_norm": 0.37696765540341987,
|
|
"learning_rate": 4.785472734958434e-06,
|
|
"loss": 1.8571,
|
|
"mean_token_accuracy": 0.6291616797447205,
|
|
"num_tokens": 97231720.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 1.89375,
|
|
"epoch": 0.14486811534782015,
|
|
"grad_norm": 0.2684036743152887,
|
|
"learning_rate": 4.781950119768916e-06,
|
|
"loss": 1.906,
|
|
"mean_token_accuracy": 0.6221611976623536,
|
|
"num_tokens": 98167971.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 1.7796875,
|
|
"epoch": 0.14623479568129014,
|
|
"grad_norm": 0.27709767555998865,
|
|
"learning_rate": 4.778427504579401e-06,
|
|
"loss": 1.7559,
|
|
"mean_token_accuracy": 0.642610365152359,
|
|
"num_tokens": 99103228.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 1.80625,
|
|
"epoch": 0.14760147601476015,
|
|
"grad_norm": 0.32458253211396765,
|
|
"learning_rate": 4.774904889389883e-06,
|
|
"loss": 1.8069,
|
|
"mean_token_accuracy": 0.6368334352970123,
|
|
"num_tokens": 100061129.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 1.8296875,
|
|
"epoch": 0.14896815634823016,
|
|
"grad_norm": 0.39449561851250126,
|
|
"learning_rate": 4.771382274200367e-06,
|
|
"loss": 1.8425,
|
|
"mean_token_accuracy": 0.6318296074867249,
|
|
"num_tokens": 100990836.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 1.88515625,
|
|
"epoch": 0.15033483668170014,
|
|
"grad_norm": 0.2907596018432501,
|
|
"learning_rate": 4.76785965901085e-06,
|
|
"loss": 1.884,
|
|
"mean_token_accuracy": 0.6224274873733521,
|
|
"num_tokens": 101950859.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 1.9640625,
|
|
"epoch": 0.15170151701517015,
|
|
"grad_norm": 0.3317376572761362,
|
|
"learning_rate": 4.764337043821333e-06,
|
|
"loss": 1.9769,
|
|
"mean_token_accuracy": 0.6110576212406158,
|
|
"num_tokens": 102863085.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 1.86171875,
|
|
"epoch": 0.15306819734864016,
|
|
"grad_norm": 0.27351040730683185,
|
|
"learning_rate": 4.760814428631817e-06,
|
|
"loss": 1.8625,
|
|
"mean_token_accuracy": 0.6253052651882172,
|
|
"num_tokens": 103778407.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 1.83671875,
|
|
"epoch": 0.15443487768211014,
|
|
"grad_norm": 0.39244972913454873,
|
|
"learning_rate": 4.7572918134423e-06,
|
|
"loss": 1.8223,
|
|
"mean_token_accuracy": 0.6316622078418732,
|
|
"num_tokens": 104663777.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 1.80859375,
|
|
"epoch": 0.15580155801558015,
|
|
"grad_norm": 0.42979201850379845,
|
|
"learning_rate": 4.753769198252783e-06,
|
|
"loss": 1.8199,
|
|
"mean_token_accuracy": 0.63526691198349,
|
|
"num_tokens": 105592651.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 1.89453125,
|
|
"epoch": 0.15716823834905017,
|
|
"grad_norm": 0.3327259441422559,
|
|
"learning_rate": 4.7502465830632665e-06,
|
|
"loss": 1.9028,
|
|
"mean_token_accuracy": 0.619716739654541,
|
|
"num_tokens": 106559758.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 1.77890625,
|
|
"epoch": 0.15853491868252015,
|
|
"grad_norm": 0.3536337354395752,
|
|
"learning_rate": 4.74672396787375e-06,
|
|
"loss": 1.7708,
|
|
"mean_token_accuracy": 0.6424124717712403,
|
|
"num_tokens": 107497578.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 1.86015625,
|
|
"epoch": 0.15990159901599016,
|
|
"grad_norm": 0.26581485559804846,
|
|
"learning_rate": 4.743201352684233e-06,
|
|
"loss": 1.8677,
|
|
"mean_token_accuracy": 0.6305820763111114,
|
|
"num_tokens": 108435950.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 1.825,
|
|
"epoch": 0.16126827934946017,
|
|
"grad_norm": 0.3995395932984954,
|
|
"learning_rate": 4.739678737494716e-06,
|
|
"loss": 1.8302,
|
|
"mean_token_accuracy": 0.6358943402767181,
|
|
"num_tokens": 109342047.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 1.85859375,
|
|
"epoch": 0.16263495968293015,
|
|
"grad_norm": 0.3408339261253478,
|
|
"learning_rate": 4.7361561223052e-06,
|
|
"loss": 1.8647,
|
|
"mean_token_accuracy": 0.6233642816543579,
|
|
"num_tokens": 110258147.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 1.8375,
|
|
"epoch": 0.16400164001640016,
|
|
"grad_norm": 0.3861144904865381,
|
|
"learning_rate": 4.7326335071156834e-06,
|
|
"loss": 1.8581,
|
|
"mean_token_accuracy": 0.6320093750953675,
|
|
"num_tokens": 111168195.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 1.8375,
|
|
"epoch": 0.16536832034987017,
|
|
"grad_norm": 0.2806654210756302,
|
|
"learning_rate": 4.729110891926166e-06,
|
|
"loss": 1.8508,
|
|
"mean_token_accuracy": 0.6285846590995788,
|
|
"num_tokens": 112116927.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 1.80078125,
|
|
"epoch": 0.16673500068334016,
|
|
"grad_norm": 0.35836599397820645,
|
|
"learning_rate": 4.72558827673665e-06,
|
|
"loss": 1.8038,
|
|
"mean_token_accuracy": 0.6358201742172241,
|
|
"num_tokens": 112999719.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 1.79453125,
|
|
"epoch": 0.16810168101681017,
|
|
"grad_norm": 0.23369739822733626,
|
|
"learning_rate": 4.722065661547132e-06,
|
|
"loss": 1.8217,
|
|
"mean_token_accuracy": 0.6350409150123596,
|
|
"num_tokens": 113933820.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 1.88515625,
|
|
"epoch": 0.16946836135028018,
|
|
"grad_norm": 0.35195297674642667,
|
|
"learning_rate": 4.718543046357617e-06,
|
|
"loss": 1.8933,
|
|
"mean_token_accuracy": 0.6232813775539399,
|
|
"num_tokens": 114897301.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 1.7859375,
|
|
"epoch": 0.17083504168375016,
|
|
"grad_norm": 0.22802199439555407,
|
|
"learning_rate": 4.7150204311680995e-06,
|
|
"loss": 1.8018,
|
|
"mean_token_accuracy": 0.6346184015274048,
|
|
"num_tokens": 115771994.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 1.859375,
|
|
"epoch": 0.17220172201722017,
|
|
"grad_norm": 0.34182705251752815,
|
|
"learning_rate": 4.711497815978583e-06,
|
|
"loss": 1.8679,
|
|
"mean_token_accuracy": 0.6279147267341614,
|
|
"num_tokens": 116695488.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 1.87578125,
|
|
"epoch": 0.17356840235069018,
|
|
"grad_norm": 0.44835090249164583,
|
|
"learning_rate": 4.707975200789066e-06,
|
|
"loss": 1.9076,
|
|
"mean_token_accuracy": 0.6198844909667969,
|
|
"num_tokens": 117662755.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 1.83671875,
|
|
"epoch": 0.17493508268416016,
|
|
"grad_norm": 0.3871612644536388,
|
|
"learning_rate": 4.704452585599549e-06,
|
|
"loss": 1.8467,
|
|
"mean_token_accuracy": 0.6269859313964844,
|
|
"num_tokens": 118580509.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 1.81015625,
|
|
"epoch": 0.17630176301763018,
|
|
"grad_norm": 0.3484744014616231,
|
|
"learning_rate": 4.700929970410033e-06,
|
|
"loss": 1.8087,
|
|
"mean_token_accuracy": 0.636076134443283,
|
|
"num_tokens": 119504334.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 1.85,
|
|
"epoch": 0.17766844335110019,
|
|
"grad_norm": 0.24240507290961577,
|
|
"learning_rate": 4.697407355220516e-06,
|
|
"loss": 1.836,
|
|
"mean_token_accuracy": 0.6334590673446655,
|
|
"num_tokens": 120461659.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 1.83125,
|
|
"epoch": 0.17903512368457017,
|
|
"grad_norm": 0.1918803656238372,
|
|
"learning_rate": 4.693884740030999e-06,
|
|
"loss": 1.8232,
|
|
"mean_token_accuracy": 0.6308600008487701,
|
|
"num_tokens": 121333653.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 1.86796875,
|
|
"epoch": 0.18040180401804018,
|
|
"grad_norm": 0.23709804351511432,
|
|
"learning_rate": 4.690362124841483e-06,
|
|
"loss": 1.8616,
|
|
"mean_token_accuracy": 0.6281003952026367,
|
|
"num_tokens": 122256643.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 1.8515625,
|
|
"epoch": 0.1817684843515102,
|
|
"grad_norm": 0.20041039571745067,
|
|
"learning_rate": 4.686839509651966e-06,
|
|
"loss": 1.8679,
|
|
"mean_token_accuracy": 0.6262637794017791,
|
|
"num_tokens": 123180454.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 1.8671875,
|
|
"epoch": 0.18313516468498017,
|
|
"grad_norm": 0.226390288337461,
|
|
"learning_rate": 4.683316894462449e-06,
|
|
"loss": 1.8846,
|
|
"mean_token_accuracy": 0.6235682606697083,
|
|
"num_tokens": 124122334.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 1.79453125,
|
|
"epoch": 0.18450184501845018,
|
|
"grad_norm": 0.2543415135551604,
|
|
"learning_rate": 4.679794279272933e-06,
|
|
"loss": 1.8094,
|
|
"mean_token_accuracy": 0.6340814292430877,
|
|
"num_tokens": 125048955.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 1.81171875,
|
|
"epoch": 0.1858685253519202,
|
|
"grad_norm": 0.2018140545095807,
|
|
"learning_rate": 4.676271664083416e-06,
|
|
"loss": 1.8334,
|
|
"mean_token_accuracy": 0.6313010334968567,
|
|
"num_tokens": 126002830.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 1.9453125,
|
|
"epoch": 0.18723520568539018,
|
|
"grad_norm": 0.3535118734375432,
|
|
"learning_rate": 4.6727490488938995e-06,
|
|
"loss": 1.9365,
|
|
"mean_token_accuracy": 0.6178468406200409,
|
|
"num_tokens": 126878869.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 1.825,
|
|
"epoch": 0.1886018860188602,
|
|
"grad_norm": 0.32790914939396815,
|
|
"learning_rate": 4.669226433704382e-06,
|
|
"loss": 1.8176,
|
|
"mean_token_accuracy": 0.63253732919693,
|
|
"num_tokens": 127801086.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 1.7984375,
|
|
"epoch": 0.1899685663523302,
|
|
"grad_norm": 0.32261311653175506,
|
|
"learning_rate": 4.665703818514866e-06,
|
|
"loss": 1.8084,
|
|
"mean_token_accuracy": 0.6365336775779724,
|
|
"num_tokens": 128701558.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 1.80078125,
|
|
"epoch": 0.19133524668580018,
|
|
"grad_norm": 0.2621147682199955,
|
|
"learning_rate": 4.6621812033253484e-06,
|
|
"loss": 1.8135,
|
|
"mean_token_accuracy": 0.6351368725299835,
|
|
"num_tokens": 129623697.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.1927019270192702,
|
|
"grad_norm": 0.33489161716691607,
|
|
"learning_rate": 4.658658588135833e-06,
|
|
"loss": 1.699,
|
|
"mean_token_accuracy": 0.6538128435611725,
|
|
"num_tokens": 130545471.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 1.775,
|
|
"epoch": 0.1940686073527402,
|
|
"grad_norm": 0.33422379660991025,
|
|
"learning_rate": 4.6551359729463155e-06,
|
|
"loss": 1.7499,
|
|
"mean_token_accuracy": 0.6434505224227905,
|
|
"num_tokens": 131531564.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 1.88515625,
|
|
"epoch": 0.19543528768621019,
|
|
"grad_norm": 0.26038523873893715,
|
|
"learning_rate": 4.651613357756799e-06,
|
|
"loss": 1.8964,
|
|
"mean_token_accuracy": 0.6192707896232605,
|
|
"num_tokens": 132461283.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 1.846875,
|
|
"epoch": 0.1968019680196802,
|
|
"grad_norm": 0.3346254705024374,
|
|
"learning_rate": 4.648090742567283e-06,
|
|
"loss": 1.8753,
|
|
"mean_token_accuracy": 0.6269659161567688,
|
|
"num_tokens": 133396786.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 1.8578125,
|
|
"epoch": 0.1981686483531502,
|
|
"grad_norm": 0.38407529148595315,
|
|
"learning_rate": 4.644568127377765e-06,
|
|
"loss": 1.8745,
|
|
"mean_token_accuracy": 0.6260507881641388,
|
|
"num_tokens": 134351633.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 1.875,
|
|
"epoch": 0.1995353286866202,
|
|
"grad_norm": 0.39392845896245904,
|
|
"learning_rate": 4.641045512188249e-06,
|
|
"loss": 1.8899,
|
|
"mean_token_accuracy": 0.6247247517108917,
|
|
"num_tokens": 135284465.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 1.8390625,
|
|
"epoch": 0.2009020090200902,
|
|
"grad_norm": 0.23447333417955588,
|
|
"learning_rate": 4.6375228969987324e-06,
|
|
"loss": 1.8525,
|
|
"mean_token_accuracy": 0.629701840877533,
|
|
"num_tokens": 136227294.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 1.85234375,
|
|
"epoch": 0.2022686893535602,
|
|
"grad_norm": 0.2957687479767175,
|
|
"learning_rate": 4.634000281809216e-06,
|
|
"loss": 1.857,
|
|
"mean_token_accuracy": 0.6293111622333527,
|
|
"num_tokens": 137171699.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 1.81953125,
|
|
"epoch": 0.2036353696870302,
|
|
"grad_norm": 0.18848121613630578,
|
|
"learning_rate": 4.630477666619699e-06,
|
|
"loss": 1.8284,
|
|
"mean_token_accuracy": 0.6312909543514251,
|
|
"num_tokens": 138076595.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 1.71796875,
|
|
"epoch": 0.2050020500205002,
|
|
"grad_norm": 0.2477385794680005,
|
|
"learning_rate": 4.626955051430182e-06,
|
|
"loss": 1.7221,
|
|
"mean_token_accuracy": 0.6516807198524475,
|
|
"num_tokens": 139031966.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 1.83984375,
|
|
"epoch": 0.20636873035397021,
|
|
"grad_norm": 0.3725235185634643,
|
|
"learning_rate": 4.623432436240665e-06,
|
|
"loss": 1.8392,
|
|
"mean_token_accuracy": 0.6304112911224365,
|
|
"num_tokens": 139895441.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 1.765625,
|
|
"epoch": 0.2077354106874402,
|
|
"grad_norm": 0.21848393751211545,
|
|
"learning_rate": 4.619909821051149e-06,
|
|
"loss": 1.7615,
|
|
"mean_token_accuracy": 0.6417655706405639,
|
|
"num_tokens": 140756213.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 1.8296875,
|
|
"epoch": 0.2091020910209102,
|
|
"grad_norm": 0.32303786338626456,
|
|
"learning_rate": 4.616387205861632e-06,
|
|
"loss": 1.8471,
|
|
"mean_token_accuracy": 0.6305793225765228,
|
|
"num_tokens": 141671513.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 1.821875,
|
|
"epoch": 0.21046877135438022,
|
|
"grad_norm": 0.35264002827366286,
|
|
"learning_rate": 4.6128645906721156e-06,
|
|
"loss": 1.8206,
|
|
"mean_token_accuracy": 0.6346014618873597,
|
|
"num_tokens": 142551214.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 1.91328125,
|
|
"epoch": 0.2118354516878502,
|
|
"grad_norm": 0.3141906313142882,
|
|
"learning_rate": 4.609341975482598e-06,
|
|
"loss": 1.9199,
|
|
"mean_token_accuracy": 0.621783584356308,
|
|
"num_tokens": 143459546.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 1.81875,
|
|
"epoch": 0.2132021320213202,
|
|
"grad_norm": 0.4362886885521019,
|
|
"learning_rate": 4.605819360293082e-06,
|
|
"loss": 1.8209,
|
|
"mean_token_accuracy": 0.6339336514472962,
|
|
"num_tokens": 144418777.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 1.76875,
|
|
"epoch": 0.21456881235479022,
|
|
"grad_norm": 0.21696109825536186,
|
|
"learning_rate": 4.602296745103565e-06,
|
|
"loss": 1.7612,
|
|
"mean_token_accuracy": 0.643210518360138,
|
|
"num_tokens": 145335406.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 1.8515625,
|
|
"epoch": 0.2159354926882602,
|
|
"grad_norm": 0.26420040053878757,
|
|
"learning_rate": 4.598774129914049e-06,
|
|
"loss": 1.8915,
|
|
"mean_token_accuracy": 0.6252532303333282,
|
|
"num_tokens": 146258382.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 1.809375,
|
|
"epoch": 0.21730217302173022,
|
|
"grad_norm": 0.21998640370768963,
|
|
"learning_rate": 4.595251514724532e-06,
|
|
"loss": 1.8039,
|
|
"mean_token_accuracy": 0.6314591586589813,
|
|
"num_tokens": 147161843.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 1.8046875,
|
|
"epoch": 0.21866885335520023,
|
|
"grad_norm": 0.26344390168944803,
|
|
"learning_rate": 4.591728899535015e-06,
|
|
"loss": 1.8069,
|
|
"mean_token_accuracy": 0.6349209547042847,
|
|
"num_tokens": 148082940.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 1.75390625,
|
|
"epoch": 0.2200355336886702,
|
|
"grad_norm": 0.23300943167977073,
|
|
"learning_rate": 4.588206284345499e-06,
|
|
"loss": 1.756,
|
|
"mean_token_accuracy": 0.6418513536453248,
|
|
"num_tokens": 149010194.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 1.93125,
|
|
"epoch": 0.22140221402214022,
|
|
"grad_norm": 0.5188339637009438,
|
|
"learning_rate": 4.584683669155981e-06,
|
|
"loss": 1.9434,
|
|
"mean_token_accuracy": 0.6162494361400604,
|
|
"num_tokens": 149954447.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 1.84296875,
|
|
"epoch": 0.22276889435561023,
|
|
"grad_norm": 0.36162057662024294,
|
|
"learning_rate": 4.581161053966465e-06,
|
|
"loss": 1.8599,
|
|
"mean_token_accuracy": 0.6299505174160004,
|
|
"num_tokens": 150936740.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 1.89921875,
|
|
"epoch": 0.2241355746890802,
|
|
"grad_norm": 0.2121839788254721,
|
|
"learning_rate": 4.5776384387769485e-06,
|
|
"loss": 1.9323,
|
|
"mean_token_accuracy": 0.6188778698444366,
|
|
"num_tokens": 151871506.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 1.80625,
|
|
"epoch": 0.22550225502255022,
|
|
"grad_norm": 0.2421746658392603,
|
|
"learning_rate": 4.574115823587432e-06,
|
|
"loss": 1.7821,
|
|
"mean_token_accuracy": 0.6405410230159759,
|
|
"num_tokens": 152802237.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 1.77578125,
|
|
"epoch": 0.22686893535602023,
|
|
"grad_norm": 0.42013570116499244,
|
|
"learning_rate": 4.570593208397915e-06,
|
|
"loss": 1.7945,
|
|
"mean_token_accuracy": 0.6374772906303405,
|
|
"num_tokens": 153727772.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 1.81796875,
|
|
"epoch": 0.22823561568949022,
|
|
"grad_norm": 0.46424346133381966,
|
|
"learning_rate": 4.567070593208398e-06,
|
|
"loss": 1.8114,
|
|
"mean_token_accuracy": 0.6361850202083588,
|
|
"num_tokens": 154631335.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 1.78515625,
|
|
"epoch": 0.22960229602296023,
|
|
"grad_norm": 0.2709493288839263,
|
|
"learning_rate": 4.563547978018882e-06,
|
|
"loss": 1.7931,
|
|
"mean_token_accuracy": 0.6348251700401306,
|
|
"num_tokens": 155568966.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 1.77578125,
|
|
"epoch": 0.23096897635643024,
|
|
"grad_norm": 0.36740644311655907,
|
|
"learning_rate": 4.560025362829365e-06,
|
|
"loss": 1.7694,
|
|
"mean_token_accuracy": 0.637085211277008,
|
|
"num_tokens": 156479013.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 1.796875,
|
|
"epoch": 0.23233565668990022,
|
|
"grad_norm": 0.2593561048922035,
|
|
"learning_rate": 4.556502747639848e-06,
|
|
"loss": 1.8147,
|
|
"mean_token_accuracy": 0.6352080404758453,
|
|
"num_tokens": 157416329.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 1.8984375,
|
|
"epoch": 0.23370233702337023,
|
|
"grad_norm": 0.21933151325081754,
|
|
"learning_rate": 4.552980132450332e-06,
|
|
"loss": 1.8928,
|
|
"mean_token_accuracy": 0.6207044363021851,
|
|
"num_tokens": 158385767.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 1.78515625,
|
|
"epoch": 0.23506901735684024,
|
|
"grad_norm": 0.38616251833245985,
|
|
"learning_rate": 4.549457517260814e-06,
|
|
"loss": 1.7883,
|
|
"mean_token_accuracy": 0.637967973947525,
|
|
"num_tokens": 159331884.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 1.79921875,
|
|
"epoch": 0.23643569769031023,
|
|
"grad_norm": 0.17109736565336983,
|
|
"learning_rate": 4.545934902071298e-06,
|
|
"loss": 1.8112,
|
|
"mean_token_accuracy": 0.6345141649246215,
|
|
"num_tokens": 160307519.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 1.83984375,
|
|
"epoch": 0.23780237802378024,
|
|
"grad_norm": 0.23172911605062607,
|
|
"learning_rate": 4.5424122868817814e-06,
|
|
"loss": 1.8455,
|
|
"mean_token_accuracy": 0.6285134494304657,
|
|
"num_tokens": 161200053.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 1.74765625,
|
|
"epoch": 0.23916905835725025,
|
|
"grad_norm": 0.30582191554311294,
|
|
"learning_rate": 4.538889671692265e-06,
|
|
"loss": 1.7546,
|
|
"mean_token_accuracy": 0.6441819250583649,
|
|
"num_tokens": 162102233.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 1.846875,
|
|
"epoch": 0.24053573869072023,
|
|
"grad_norm": 0.21498812793962274,
|
|
"learning_rate": 4.535367056502748e-06,
|
|
"loss": 1.8542,
|
|
"mean_token_accuracy": 0.6313648879528045,
|
|
"num_tokens": 163056620.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 1.834375,
|
|
"epoch": 0.24190241902419024,
|
|
"grad_norm": 0.26693697970563435,
|
|
"learning_rate": 4.531844441313231e-06,
|
|
"loss": 1.8274,
|
|
"mean_token_accuracy": 0.6310067892074585,
|
|
"num_tokens": 163978099.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 1.75390625,
|
|
"epoch": 0.24326909935766025,
|
|
"grad_norm": 0.19853910888504792,
|
|
"learning_rate": 4.528321826123715e-06,
|
|
"loss": 1.7687,
|
|
"mean_token_accuracy": 0.6419982075691223,
|
|
"num_tokens": 164881935.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 1.82109375,
|
|
"epoch": 0.24463577969113023,
|
|
"grad_norm": 0.3077421353206064,
|
|
"learning_rate": 4.524799210934198e-06,
|
|
"loss": 1.8247,
|
|
"mean_token_accuracy": 0.6327477514743804,
|
|
"num_tokens": 165810181.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 1.7953125,
|
|
"epoch": 0.24600246002460024,
|
|
"grad_norm": 0.21808311014807585,
|
|
"learning_rate": 4.521276595744681e-06,
|
|
"loss": 1.7798,
|
|
"mean_token_accuracy": 0.6393623292446137,
|
|
"num_tokens": 166703645.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 1.8109375,
|
|
"epoch": 0.24736914035807026,
|
|
"grad_norm": 0.3916597678822117,
|
|
"learning_rate": 4.5177539805551646e-06,
|
|
"loss": 1.8259,
|
|
"mean_token_accuracy": 0.6313347160816193,
|
|
"num_tokens": 167621325.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 1.765625,
|
|
"epoch": 0.24873582069154024,
|
|
"grad_norm": 0.28941824873142585,
|
|
"learning_rate": 4.514231365365648e-06,
|
|
"loss": 1.7689,
|
|
"mean_token_accuracy": 0.6405619025230408,
|
|
"num_tokens": 168509372.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 1.78671875,
|
|
"epoch": 0.25010250102501025,
|
|
"grad_norm": 0.27590923208716805,
|
|
"learning_rate": 4.510708750176131e-06,
|
|
"loss": 1.781,
|
|
"mean_token_accuracy": 0.6374352931976318,
|
|
"num_tokens": 169409449.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 1.778125,
|
|
"epoch": 0.25146918135848023,
|
|
"grad_norm": 0.3017879897916132,
|
|
"learning_rate": 4.507186134986614e-06,
|
|
"loss": 1.7747,
|
|
"mean_token_accuracy": 0.6397411584854126,
|
|
"num_tokens": 170375069.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 1.83515625,
|
|
"epoch": 0.25283586169195027,
|
|
"grad_norm": 0.37990959345188435,
|
|
"learning_rate": 4.503663519797098e-06,
|
|
"loss": 1.8398,
|
|
"mean_token_accuracy": 0.6291960597038269,
|
|
"num_tokens": 171300299.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 1.79296875,
|
|
"epoch": 0.25420254202542025,
|
|
"grad_norm": 0.2644804666947377,
|
|
"learning_rate": 4.5001409046075814e-06,
|
|
"loss": 1.7927,
|
|
"mean_token_accuracy": 0.6403003752231597,
|
|
"num_tokens": 172222664.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 1.790625,
|
|
"epoch": 0.25556922235889024,
|
|
"grad_norm": 0.21010144399599895,
|
|
"learning_rate": 4.496618289418064e-06,
|
|
"loss": 1.793,
|
|
"mean_token_accuracy": 0.6390233755111694,
|
|
"num_tokens": 173173238.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 1.78515625,
|
|
"epoch": 0.2569359026923603,
|
|
"grad_norm": 0.2598222462804347,
|
|
"learning_rate": 4.493095674228548e-06,
|
|
"loss": 1.7852,
|
|
"mean_token_accuracy": 0.640588515996933,
|
|
"num_tokens": 174072727.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 1.84453125,
|
|
"epoch": 0.25830258302583026,
|
|
"grad_norm": 0.27635829811769214,
|
|
"learning_rate": 4.48957305903903e-06,
|
|
"loss": 1.8371,
|
|
"mean_token_accuracy": 0.628589141368866,
|
|
"num_tokens": 175009745.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 1.77109375,
|
|
"epoch": 0.25966926335930024,
|
|
"grad_norm": 0.2855684821774331,
|
|
"learning_rate": 4.486050443849515e-06,
|
|
"loss": 1.7785,
|
|
"mean_token_accuracy": 0.641683554649353,
|
|
"num_tokens": 175926311.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 1.77734375,
|
|
"epoch": 0.2610359436927703,
|
|
"grad_norm": 0.2117805419523993,
|
|
"learning_rate": 4.4825278286599975e-06,
|
|
"loss": 1.7833,
|
|
"mean_token_accuracy": 0.6380048453807831,
|
|
"num_tokens": 176859810.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 1.73359375,
|
|
"epoch": 0.26240262402624026,
|
|
"grad_norm": 0.2141843794594821,
|
|
"learning_rate": 4.479005213470481e-06,
|
|
"loss": 1.7317,
|
|
"mean_token_accuracy": 0.6470535278320313,
|
|
"num_tokens": 177776492.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 1.83125,
|
|
"epoch": 0.26376930435971024,
|
|
"grad_norm": 0.44924722973156617,
|
|
"learning_rate": 4.475482598280964e-06,
|
|
"loss": 1.8215,
|
|
"mean_token_accuracy": 0.6300243437290192,
|
|
"num_tokens": 178696196.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 1.76015625,
|
|
"epoch": 0.2651359846931803,
|
|
"grad_norm": 0.21347909972772133,
|
|
"learning_rate": 4.471959983091447e-06,
|
|
"loss": 1.7706,
|
|
"mean_token_accuracy": 0.6405327439308166,
|
|
"num_tokens": 179623040.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 1.79375,
|
|
"epoch": 0.26650266502665027,
|
|
"grad_norm": 0.22280705338060908,
|
|
"learning_rate": 4.468437367901931e-06,
|
|
"loss": 1.8102,
|
|
"mean_token_accuracy": 0.6363321602344513,
|
|
"num_tokens": 180550652.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 1.8328125,
|
|
"epoch": 0.26786934536012025,
|
|
"grad_norm": 0.30187246579610955,
|
|
"learning_rate": 4.464914752712414e-06,
|
|
"loss": 1.8291,
|
|
"mean_token_accuracy": 0.6313114941120148,
|
|
"num_tokens": 181457910.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 1.76953125,
|
|
"epoch": 0.2692360256935903,
|
|
"grad_norm": 0.3400086157703264,
|
|
"learning_rate": 4.461392137522897e-06,
|
|
"loss": 1.7687,
|
|
"mean_token_accuracy": 0.6392099559307098,
|
|
"num_tokens": 182393588.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 1.72421875,
|
|
"epoch": 0.27060270602706027,
|
|
"grad_norm": 0.2365433558061304,
|
|
"learning_rate": 4.457869522333381e-06,
|
|
"loss": 1.7268,
|
|
"mean_token_accuracy": 0.6463927209377289,
|
|
"num_tokens": 183291143.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 1.7484375,
|
|
"epoch": 0.27196938636053025,
|
|
"grad_norm": 0.28935598581358796,
|
|
"learning_rate": 4.454346907143864e-06,
|
|
"loss": 1.7552,
|
|
"mean_token_accuracy": 0.6444904029369354,
|
|
"num_tokens": 184205657.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 1.8234375,
|
|
"epoch": 0.2733360666940003,
|
|
"grad_norm": 0.22968981096255223,
|
|
"learning_rate": 4.450824291954347e-06,
|
|
"loss": 1.8317,
|
|
"mean_token_accuracy": 0.6295570313930512,
|
|
"num_tokens": 185143664.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 1.73203125,
|
|
"epoch": 0.2747027470274703,
|
|
"grad_norm": 0.41396860653220624,
|
|
"learning_rate": 4.44730167676483e-06,
|
|
"loss": 1.7471,
|
|
"mean_token_accuracy": 0.6449798703193664,
|
|
"num_tokens": 186076509.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 1.8640625,
|
|
"epoch": 0.27606942736094026,
|
|
"grad_norm": 0.26731279591713003,
|
|
"learning_rate": 4.443779061575314e-06,
|
|
"loss": 1.8753,
|
|
"mean_token_accuracy": 0.6255852401256561,
|
|
"num_tokens": 186997437.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.2774361076944103,
|
|
"grad_norm": 0.36308653884425646,
|
|
"learning_rate": 4.4402564463857975e-06,
|
|
"loss": 1.7035,
|
|
"mean_token_accuracy": 0.6462227046489716,
|
|
"num_tokens": 187883347.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 1.80390625,
|
|
"epoch": 0.2788027880278803,
|
|
"grad_norm": 0.25986365142363216,
|
|
"learning_rate": 4.43673383119628e-06,
|
|
"loss": 1.8501,
|
|
"mean_token_accuracy": 0.6285334825515747,
|
|
"num_tokens": 188783982.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 1.80234375,
|
|
"epoch": 0.28016946836135026,
|
|
"grad_norm": 0.3055677279020864,
|
|
"learning_rate": 4.433211216006764e-06,
|
|
"loss": 1.822,
|
|
"mean_token_accuracy": 0.6341411352157593,
|
|
"num_tokens": 189673102.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 1.72265625,
|
|
"epoch": 0.2815361486948203,
|
|
"grad_norm": 0.2524364769319253,
|
|
"learning_rate": 4.4296886008172465e-06,
|
|
"loss": 1.7306,
|
|
"mean_token_accuracy": 0.6445607244968414,
|
|
"num_tokens": 190584040.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 1.7515625,
|
|
"epoch": 0.2829028290282903,
|
|
"grad_norm": 0.18067880349252732,
|
|
"learning_rate": 4.426165985627731e-06,
|
|
"loss": 1.7658,
|
|
"mean_token_accuracy": 0.6417096495628357,
|
|
"num_tokens": 191547584.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 1.7140625,
|
|
"epoch": 0.28426950936176026,
|
|
"grad_norm": 0.2122302693014469,
|
|
"learning_rate": 4.4226433704382136e-06,
|
|
"loss": 1.7031,
|
|
"mean_token_accuracy": 0.6527187168598175,
|
|
"num_tokens": 192439308.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 1.759375,
|
|
"epoch": 0.2856361896952303,
|
|
"grad_norm": 0.16537412322597908,
|
|
"learning_rate": 4.419120755248697e-06,
|
|
"loss": 1.7451,
|
|
"mean_token_accuracy": 0.6443639457225799,
|
|
"num_tokens": 193368864.0,
|
|
"step": 2090
|
|
},
|
|
{
"entropy": 1.8625,
"epoch": 0.2870028700287003,
"grad_norm": 0.4195695871704014,
"learning_rate": 4.41559814005918e-06,
"loss": 1.8891,
"mean_token_accuracy": 0.6229565560817718,
"num_tokens": 194306360.0,
"step": 2100
},
{
"entropy": 1.7546875,
"epoch": 0.28836955036217027,
"grad_norm": 0.2658993374592815,
"learning_rate": 4.412075524869663e-06,
"loss": 1.7501,
"mean_token_accuracy": 0.6454492270946502,
"num_tokens": 195184891.0,
"step": 2110
},
{
"entropy": 1.78671875,
"epoch": 0.2897362306956403,
"grad_norm": 0.2753500238614729,
"learning_rate": 4.408552909680147e-06,
"loss": 1.783,
"mean_token_accuracy": 0.6405049085617065,
"num_tokens": 196099408.0,
"step": 2120
},
{
"entropy": 1.803125,
"epoch": 0.2911029110291103,
"grad_norm": 0.1968025121487427,
"learning_rate": 4.4050302944906304e-06,
"loss": 1.8117,
"mean_token_accuracy": 0.6345217287540436,
"num_tokens": 197036417.0,
"step": 2130
},
{
"entropy": 1.80234375,
"epoch": 0.2924695913625803,
"grad_norm": 0.24495703630858623,
"learning_rate": 4.401507679301113e-06,
"loss": 1.783,
"mean_token_accuracy": 0.6373064815998077,
"num_tokens": 197925774.0,
"step": 2140
},
{
"entropy": 1.88671875,
"epoch": 0.2938362716960503,
"grad_norm": 0.20348996063636954,
"learning_rate": 4.397985064111597e-06,
"loss": 1.9019,
"mean_token_accuracy": 0.6210007786750793,
"num_tokens": 198884434.0,
"step": 2150
},
{
"entropy": 1.778125,
"epoch": 0.2952029520295203,
"grad_norm": 0.2156280749746983,
"learning_rate": 4.39446244892208e-06,
"loss": 1.7797,
"mean_token_accuracy": 0.6375832319259643,
"num_tokens": 199808572.0,
"step": 2160
},
{
"entropy": 1.825,
"epoch": 0.2965696323629903,
"grad_norm": 0.22562362185436255,
"learning_rate": 4.390939833732563e-06,
"loss": 1.8269,
"mean_token_accuracy": 0.6344954013824463,
"num_tokens": 200657377.0,
"step": 2170
},
{
"entropy": 1.75390625,
"epoch": 0.2979363126964603,
"grad_norm": 0.20015816286400592,
"learning_rate": 4.387417218543047e-06,
"loss": 1.7629,
"mean_token_accuracy": 0.6410900354385376,
"num_tokens": 201592453.0,
"step": 2180
},
{
"entropy": 1.69765625,
"epoch": 0.2993029930299303,
"grad_norm": 0.2663643063820271,
"learning_rate": 4.38389460335353e-06,
"loss": 1.6755,
"mean_token_accuracy": 0.6533253073692322,
"num_tokens": 202508489.0,
"step": 2190
},
{
"entropy": 1.803125,
"epoch": 0.3006696733634003,
"grad_norm": 0.3414517770258873,
"learning_rate": 4.380371988164014e-06,
"loss": 1.8137,
"mean_token_accuracy": 0.631429272890091,
"num_tokens": 203486955.0,
"step": 2200
},
{
"entropy": 1.7625,
"epoch": 0.3020363536968703,
"grad_norm": 0.23582171838872357,
"learning_rate": 4.376849372974496e-06,
"loss": 1.7618,
"mean_token_accuracy": 0.6442265331745147,
"num_tokens": 204437915.0,
"step": 2210
},
{
"entropy": 1.75,
"epoch": 0.3034030340303403,
"grad_norm": 0.21028195437538955,
"learning_rate": 4.37332675778498e-06,
"loss": 1.7513,
"mean_token_accuracy": 0.6402509152889252,
"num_tokens": 205369100.0,
"step": 2220
},
{
"entropy": 1.73671875,
"epoch": 0.3047697143638103,
"grad_norm": 0.9186156664144405,
"learning_rate": 4.369804142595463e-06,
"loss": 1.7571,
"mean_token_accuracy": 0.6420955896377564,
"num_tokens": 206283225.0,
"step": 2230
},
{
"entropy": 1.7625,
"epoch": 0.3061363946972803,
"grad_norm": 0.17346842413316382,
"learning_rate": 4.366281527405947e-06,
"loss": 1.7621,
"mean_token_accuracy": 0.6407465636730194,
"num_tokens": 207244837.0,
"step": 2240
},
{
"entropy": 1.75625,
"epoch": 0.3075030750307503,
"grad_norm": 0.21831471277042885,
"learning_rate": 4.36275891221643e-06,
"loss": 1.7552,
"mean_token_accuracy": 0.6406942784786225,
"num_tokens": 208176679.0,
"step": 2250
},
{
"entropy": 1.71484375,
"epoch": 0.3088697553642203,
"grad_norm": 0.30927033395246173,
"learning_rate": 4.359236297026913e-06,
"loss": 1.7253,
"mean_token_accuracy": 0.6462858498096467,
"num_tokens": 209087973.0,
"step": 2260
},
{
"entropy": 1.69375,
"epoch": 0.3102364356976903,
"grad_norm": 0.31385059532847914,
"learning_rate": 4.355713681837396e-06,
"loss": 1.7007,
"mean_token_accuracy": 0.6492085099220276,
"num_tokens": 210003196.0,
"step": 2270
},
{
"entropy": 1.7421875,
"epoch": 0.3116031160311603,
"grad_norm": 0.4199225271735183,
"learning_rate": 4.352191066647879e-06,
"loss": 1.7552,
"mean_token_accuracy": 0.6415762841701508,
"num_tokens": 210911918.0,
"step": 2280
},
{
"entropy": 1.69921875,
"epoch": 0.3129697963646303,
"grad_norm": 0.21480485353550383,
"learning_rate": 4.348668451458363e-06,
"loss": 1.7026,
"mean_token_accuracy": 0.648815143108368,
"num_tokens": 211848750.0,
"step": 2290
},
{
"entropy": 1.86953125,
"epoch": 0.31433647669810033,
"grad_norm": 0.27477135267629577,
"learning_rate": 4.3451458362688465e-06,
"loss": 1.8595,
"mean_token_accuracy": 0.6265469193458557,
"num_tokens": 212762655.0,
"step": 2300
},
{
"entropy": 1.775,
"epoch": 0.3157031570315703,
"grad_norm": 0.20356721001745723,
"learning_rate": 4.34162322107933e-06,
"loss": 1.7669,
"mean_token_accuracy": 0.6414711654186249,
"num_tokens": 213665788.0,
"step": 2310
},
{
"entropy": 1.76328125,
"epoch": 0.3170698373650403,
"grad_norm": 0.2778429999967278,
"learning_rate": 4.338100605889813e-06,
"loss": 1.7591,
"mean_token_accuracy": 0.6420689761638642,
"num_tokens": 214555444.0,
"step": 2320
},
{
"entropy": 1.68359375,
"epoch": 0.31843651769851034,
"grad_norm": 0.18655598510096708,
"learning_rate": 4.334577990700296e-06,
"loss": 1.6852,
"mean_token_accuracy": 0.6541097700595856,
"num_tokens": 215457634.0,
"step": 2330
},
{
"entropy": 1.7953125,
"epoch": 0.3198031980319803,
"grad_norm": 0.2641824054909146,
"learning_rate": 4.33105537551078e-06,
"loss": 1.8009,
"mean_token_accuracy": 0.6321450889110565,
"num_tokens": 216362597.0,
"step": 2340
},
{
"entropy": 1.7046875,
"epoch": 0.3211698783654503,
"grad_norm": 0.23582804789535874,
"learning_rate": 4.327532760321263e-06,
"loss": 1.6878,
"mean_token_accuracy": 0.6549837708473205,
"num_tokens": 217247903.0,
"step": 2350
},
{
"entropy": 1.74140625,
"epoch": 0.32253655869892034,
"grad_norm": 0.35226035983857285,
"learning_rate": 4.324010145131746e-06,
"loss": 1.7455,
"mean_token_accuracy": 0.6436051845550537,
"num_tokens": 218159615.0,
"step": 2360
},
{
"entropy": 1.76640625,
"epoch": 0.3239032390323903,
"grad_norm": 0.24275176899001183,
"learning_rate": 4.32048752994223e-06,
"loss": 1.7914,
"mean_token_accuracy": 0.6379481673240661,
"num_tokens": 219122838.0,
"step": 2370
},
{
"entropy": 1.7625,
"epoch": 0.3252699193658603,
"grad_norm": 0.2874867011421546,
"learning_rate": 4.316964914752712e-06,
"loss": 1.7688,
"mean_token_accuracy": 0.640893405675888,
"num_tokens": 220049077.0,
"step": 2380
},
{
"entropy": 1.7734375,
"epoch": 0.32663659969933034,
"grad_norm": 0.284804466701777,
"learning_rate": 4.313442299563196e-06,
"loss": 1.7767,
"mean_token_accuracy": 0.6414421677589417,
"num_tokens": 220929925.0,
"step": 2390
},
{
"entropy": 1.72890625,
"epoch": 0.3280032800328003,
"grad_norm": 0.20550304224382432,
"learning_rate": 4.3099196843736794e-06,
"loss": 1.7416,
"mean_token_accuracy": 0.6435724139213562,
"num_tokens": 221821571.0,
"step": 2400
},
{
"entropy": 1.77578125,
"epoch": 0.3293699603662703,
"grad_norm": 0.22708451182350112,
"learning_rate": 4.306397069184163e-06,
"loss": 1.7524,
"mean_token_accuracy": 0.6413148760795593,
"num_tokens": 222737027.0,
"step": 2410
},
{
"entropy": 1.771875,
"epoch": 0.33073664069974035,
"grad_norm": 0.2662716989366884,
"learning_rate": 4.302874453994646e-06,
"loss": 1.7669,
"mean_token_accuracy": 0.6376565515995025,
"num_tokens": 223642969.0,
"step": 2420
},
{
"entropy": 1.68984375,
"epoch": 0.33210332103321033,
"grad_norm": 0.3763972913509147,
"learning_rate": 4.299351838805129e-06,
"loss": 1.7019,
"mean_token_accuracy": 0.6491401612758636,
"num_tokens": 224570677.0,
"step": 2430
},
{
"entropy": 1.74921875,
"epoch": 0.3334700013666803,
"grad_norm": 0.318650153543206,
"learning_rate": 4.295829223615613e-06,
"loss": 1.7511,
"mean_token_accuracy": 0.6425083398818969,
"num_tokens": 225506234.0,
"step": 2440
},
{
|
|
"entropy": 1.80234375,
|
|
"epoch": 0.33483668170015035,
|
|
"grad_norm": 0.2076955626160203,
|
|
"learning_rate": 4.292306608426096e-06,
|
|
"loss": 1.7852,
|
|
"mean_token_accuracy": 0.6393538236618042,
|
|
"num_tokens": 226416309.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.33620336203362033,
|
|
"grad_norm": 0.2242041338032885,
|
|
"learning_rate": 4.288783993236579e-06,
|
|
"loss": 1.6782,
|
|
"mean_token_accuracy": 0.6534141957759857,
|
|
"num_tokens": 227353374.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 1.76796875,
|
|
"epoch": 0.3375700423670903,
|
|
"grad_norm": 0.18910284322569226,
|
|
"learning_rate": 4.285261378047063e-06,
|
|
"loss": 1.7501,
|
|
"mean_token_accuracy": 0.6431631207466125,
|
|
"num_tokens": 228287374.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 1.7859375,
|
|
"epoch": 0.33893672270056036,
|
|
"grad_norm": 0.41260321848528253,
|
|
"learning_rate": 4.281738762857546e-06,
|
|
"loss": 1.7955,
|
|
"mean_token_accuracy": 0.6328104197978973,
|
|
"num_tokens": 229217134.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 1.81953125,
|
|
"epoch": 0.34030340303403034,
|
|
"grad_norm": 0.1760344258757106,
|
|
"learning_rate": 4.278216147668029e-06,
|
|
"loss": 1.8245,
|
|
"mean_token_accuracy": 0.6310071170330047,
|
|
"num_tokens": 230150527.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 1.7578125,
|
|
"epoch": 0.3416700833675003,
|
|
"grad_norm": 0.430314684284489,
|
|
"learning_rate": 4.274693532478512e-06,
|
|
"loss": 1.7633,
|
|
"mean_token_accuracy": 0.6391462564468384,
|
|
"num_tokens": 231075546.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.34303676370097036,
|
|
"grad_norm": 0.21718415954371245,
|
|
"learning_rate": 4.271170917288996e-06,
|
|
"loss": 1.6912,
|
|
"mean_token_accuracy": 0.6530725240707398,
|
|
"num_tokens": 231990330.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 1.71796875,
|
|
"epoch": 0.34440344403444034,
|
|
"grad_norm": 0.36884052435457454,
|
|
"learning_rate": 4.2676483020994795e-06,
|
|
"loss": 1.7047,
|
|
"mean_token_accuracy": 0.6506029188632965,
|
|
"num_tokens": 232926675.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 1.79921875,
|
|
"epoch": 0.3457701243679103,
|
|
"grad_norm": 0.1773042084731685,
|
|
"learning_rate": 4.264125686909962e-06,
|
|
"loss": 1.8192,
|
|
"mean_token_accuracy": 0.6336402416229248,
|
|
"num_tokens": 233888353.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 1.76875,
|
|
"epoch": 0.34713680470138036,
|
|
"grad_norm": 0.32626463200036626,
|
|
"learning_rate": 4.260603071720446e-06,
|
|
"loss": 1.7794,
|
|
"mean_token_accuracy": 0.6401168227195739,
|
|
"num_tokens": 234794291.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 1.7859375,
|
|
"epoch": 0.34850348503485035,
|
|
"grad_norm": 0.25200311534196973,
|
|
"learning_rate": 4.257080456530928e-06,
|
|
"loss": 1.7872,
|
|
"mean_token_accuracy": 0.637760853767395,
|
|
"num_tokens": 235717981.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.34987016536832033,
|
|
"grad_norm": 0.2806378284118828,
|
|
"learning_rate": 4.253557841341413e-06,
|
|
"loss": 1.682,
|
|
"mean_token_accuracy": 0.6519226908683777,
|
|
"num_tokens": 236653950.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 0.35123684570179037,
|
|
"grad_norm": 0.4089435482976592,
|
|
"learning_rate": 4.2500352261518955e-06,
|
|
"loss": 1.7471,
|
|
"mean_token_accuracy": 0.6421530604362488,
|
|
"num_tokens": 237581166.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.35260352603526035,
|
|
"grad_norm": 0.267224529010269,
|
|
"learning_rate": 4.246512610962379e-06,
|
|
"loss": 1.6694,
|
|
"mean_token_accuracy": 0.6556879460811615,
|
|
"num_tokens": 238506054.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 1.73359375,
|
|
"epoch": 0.35397020636873033,
|
|
"grad_norm": 0.4030232455305336,
|
|
"learning_rate": 4.242989995772862e-06,
|
|
"loss": 1.7411,
|
|
"mean_token_accuracy": 0.6458200216293335,
|
|
"num_tokens": 239394183.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 1.80234375,
|
|
"epoch": 0.35533688670220037,
|
|
"grad_norm": 0.32655127488452307,
|
|
"learning_rate": 4.239467380583345e-06,
|
|
"loss": 1.8017,
|
|
"mean_token_accuracy": 0.6344713389873504,
|
|
"num_tokens": 240365958.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.35670356703567035,
|
|
"grad_norm": 0.1991547958031096,
|
|
"learning_rate": 4.235944765393829e-06,
|
|
"loss": 1.6902,
|
|
"mean_token_accuracy": 0.6513091862201691,
|
|
"num_tokens": 241282122.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 1.7953125,
|
|
"epoch": 0.35807024736914034,
|
|
"grad_norm": 0.18307088077760097,
|
|
"learning_rate": 4.232422150204312e-06,
|
|
"loss": 1.8046,
|
|
"mean_token_accuracy": 0.6330149710178375,
|
|
"num_tokens": 242168669.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 1.84375,
|
|
"epoch": 0.3594369277026104,
|
|
"grad_norm": 0.20754327104358591,
|
|
"learning_rate": 4.228899535014795e-06,
|
|
"loss": 1.8471,
|
|
"mean_token_accuracy": 0.6264434337615967,
|
|
"num_tokens": 243057076.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 1.75234375,
|
|
"epoch": 0.36080360803608036,
|
|
"grad_norm": 0.5058734185167553,
|
|
"learning_rate": 4.225376919825279e-06,
|
|
"loss": 1.7562,
|
|
"mean_token_accuracy": 0.6400220453739166,
|
|
"num_tokens": 243972568.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 1.76015625,
|
|
"epoch": 0.36217028836955034,
|
|
"grad_norm": 0.2962985504981598,
|
|
"learning_rate": 4.221854304635762e-06,
|
|
"loss": 1.7672,
|
|
"mean_token_accuracy": 0.6403412222862244,
|
|
"num_tokens": 244888393.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 1.81484375,
|
|
"epoch": 0.3635369687030204,
|
|
"grad_norm": 0.24983831545657326,
|
|
"learning_rate": 4.218331689446245e-06,
|
|
"loss": 1.829,
|
|
"mean_token_accuracy": 0.6303679823875428,
|
|
"num_tokens": 245817941.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.36490364903649036,
|
|
"grad_norm": 0.23546012524709053,
|
|
"learning_rate": 4.2148090742567284e-06,
|
|
"loss": 1.7029,
|
|
"mean_token_accuracy": 0.6497031867504119,
|
|
"num_tokens": 246732614.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 1.76953125,
|
|
"epoch": 0.36627032936996035,
|
|
"grad_norm": 0.17476822836987524,
|
|
"learning_rate": 4.211286459067212e-06,
|
|
"loss": 1.8037,
|
|
"mean_token_accuracy": 0.6297558426856995,
|
|
"num_tokens": 247706776.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.3676370097034304,
|
|
"grad_norm": 0.34370364684105265,
|
|
"learning_rate": 4.2077638438776955e-06,
|
|
"loss": 1.6667,
|
|
"mean_token_accuracy": 0.6582704186439514,
|
|
"num_tokens": 248601078.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.36900369003690037,
|
|
"grad_norm": 0.20711975856737994,
|
|
"learning_rate": 4.204241228688178e-06,
|
|
"loss": 1.7124,
|
|
"mean_token_accuracy": 0.6498324930667877,
|
|
"num_tokens": 249607915.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 0.37037037037037035,
|
|
"grad_norm": 0.19412742688878837,
|
|
"learning_rate": 4.200718613498662e-06,
|
|
"loss": 1.6924,
|
|
"mean_token_accuracy": 0.6516908645629883,
|
|
"num_tokens": 250541333.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 1.75546875,
|
|
"epoch": 0.3717370507038404,
|
|
"grad_norm": 0.31432062708327774,
|
|
"learning_rate": 4.1971959983091445e-06,
|
|
"loss": 1.7555,
|
|
"mean_token_accuracy": 0.6435626208782196,
|
|
"num_tokens": 251444577.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.37310373103731037,
|
|
"grad_norm": 0.40786389472663165,
|
|
"learning_rate": 4.193673383119629e-06,
|
|
"loss": 1.6775,
|
|
"mean_token_accuracy": 0.6522301316261292,
|
|
"num_tokens": 252393534.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 1.85859375,
|
|
"epoch": 0.37447041137078035,
|
|
"grad_norm": 0.2794446338481059,
|
|
"learning_rate": 4.190150767930112e-06,
|
|
"loss": 1.8638,
|
|
"mean_token_accuracy": 0.626981669664383,
|
|
"num_tokens": 253385809.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 1.7890625,
|
|
"epoch": 0.3758370917042504,
|
|
"grad_norm": 0.2478129732121643,
|
|
"learning_rate": 4.186628152740595e-06,
|
|
"loss": 1.7944,
|
|
"mean_token_accuracy": 0.6365641117095947,
|
|
"num_tokens": 254323556.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 1.759375,
|
|
"epoch": 0.3772037720377204,
|
|
"grad_norm": 0.2891164795476671,
|
|
"learning_rate": 4.183105537551078e-06,
|
|
"loss": 1.7532,
|
|
"mean_token_accuracy": 0.6407860517501831,
|
|
"num_tokens": 255248502.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 1.8203125,
|
|
"epoch": 0.37857045237119036,
|
|
"grad_norm": 0.21276166073212294,
|
|
"learning_rate": 4.179582922361561e-06,
|
|
"loss": 1.8319,
|
|
"mean_token_accuracy": 0.6292681038379669,
|
|
"num_tokens": 256139840.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 1.7921875,
|
|
"epoch": 0.3799371327046604,
|
|
"grad_norm": 0.17586733777626226,
|
|
"learning_rate": 4.176060307172045e-06,
|
|
"loss": 1.7925,
|
|
"mean_token_accuracy": 0.6359074473381042,
|
|
"num_tokens": 257089986.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.3813038130381304,
|
|
"grad_norm": 0.20089220768483543,
|
|
"learning_rate": 4.1725376919825285e-06,
|
|
"loss": 1.7179,
|
|
"mean_token_accuracy": 0.6477283477783203,
|
|
"num_tokens": 257982295.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.38267049337160036,
|
|
"grad_norm": 0.17164740687347588,
|
|
"learning_rate": 4.169015076793011e-06,
|
|
"loss": 1.6953,
|
|
"mean_token_accuracy": 0.6492377102375031,
|
|
"num_tokens": 258874402.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.3840371737050704,
|
|
"grad_norm": 0.2609824056137697,
|
|
"learning_rate": 4.165492461603495e-06,
|
|
"loss": 1.6911,
|
|
"mean_token_accuracy": 0.6491614639759063,
|
|
"num_tokens": 259811203.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 0.3854038540385404,
|
|
"grad_norm": 0.18427842734472016,
|
|
"learning_rate": 4.161969846413978e-06,
|
|
"loss": 1.6614,
|
|
"mean_token_accuracy": 0.6533154428005219,
|
|
"num_tokens": 260707238.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 0.38677053437201037,
|
|
"grad_norm": 0.22427616385320803,
|
|
"learning_rate": 4.158447231224461e-06,
|
|
"loss": 1.6881,
|
|
"mean_token_accuracy": 0.6512023150920868,
|
|
"num_tokens": 261685481.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 1.7671875,
|
|
"epoch": 0.3881372147054804,
|
|
"grad_norm": 0.35655740664242064,
|
|
"learning_rate": 4.1549246160349445e-06,
|
|
"loss": 1.7606,
|
|
"mean_token_accuracy": 0.6397002041339874,
|
|
"num_tokens": 262649936.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 1.790625,
|
|
"epoch": 0.3895038950389504,
|
|
"grad_norm": 0.28171655995687156,
|
|
"learning_rate": 4.151402000845428e-06,
|
|
"loss": 1.805,
|
|
"mean_token_accuracy": 0.6364287674427033,
|
|
"num_tokens": 263592442.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 0.39087057537242037,
|
|
"grad_norm": 0.23734074057071072,
|
|
"learning_rate": 4.147879385655912e-06,
|
|
"loss": 1.6514,
|
|
"mean_token_accuracy": 0.6579195320606231,
|
|
"num_tokens": 264512538.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 1.73515625,
|
|
"epoch": 0.3922372557058904,
|
|
"grad_norm": 0.2071398259790424,
|
|
"learning_rate": 4.144356770466394e-06,
|
|
"loss": 1.7238,
|
|
"mean_token_accuracy": 0.6439678370952606,
|
|
"num_tokens": 265400491.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 1.81171875,
|
|
"epoch": 0.3936039360393604,
|
|
"grad_norm": 0.23016550274061848,
|
|
"learning_rate": 4.140834155276878e-06,
|
|
"loss": 1.8106,
|
|
"mean_token_accuracy": 0.6318399310112,
|
|
"num_tokens": 266341603.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 1.73984375,
|
|
"epoch": 0.3949706163728304,
|
|
"grad_norm": 0.21641206613322975,
|
|
"learning_rate": 4.137311540087361e-06,
|
|
"loss": 1.7508,
|
|
"mean_token_accuracy": 0.6448728859424591,
|
|
"num_tokens": 267319884.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 1.7765625,
|
|
"epoch": 0.3963372967063004,
|
|
"grad_norm": 0.2199411964756763,
|
|
"learning_rate": 4.133788924897845e-06,
|
|
"loss": 1.7574,
|
|
"mean_token_accuracy": 0.6412050604820252,
|
|
"num_tokens": 268252375.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.3977039770397704,
|
|
"grad_norm": 0.21363961861825698,
|
|
"learning_rate": 4.130266309708328e-06,
|
|
"loss": 1.6559,
|
|
"mean_token_accuracy": 0.6586819231510163,
|
|
"num_tokens": 269167901.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 1.7921875,
|
|
"epoch": 0.3990706573732404,
|
|
"grad_norm": 0.1917248413718354,
|
|
"learning_rate": 4.126743694518811e-06,
|
|
"loss": 1.7907,
|
|
"mean_token_accuracy": 0.6370440423488617,
|
|
"num_tokens": 270118365.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 1.7421875,
|
|
"epoch": 0.4004373377067104,
|
|
"grad_norm": 0.17645902475343986,
|
|
"learning_rate": 4.123221079329294e-06,
|
|
"loss": 1.7425,
|
|
"mean_token_accuracy": 0.6435442626476288,
|
|
"num_tokens": 270984741.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 1.81484375,
|
|
"epoch": 0.4018040180401804,
|
|
"grad_norm": 0.3164024208367383,
|
|
"learning_rate": 4.1196984641397774e-06,
|
|
"loss": 1.8119,
|
|
"mean_token_accuracy": 0.6330052673816681,
|
|
"num_tokens": 271888394.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 0.4031706983736504,
|
|
"grad_norm": 0.19357027044466213,
|
|
"learning_rate": 4.116175848950261e-06,
|
|
"loss": 1.6461,
|
|
"mean_token_accuracy": 0.660742563009262,
|
|
"num_tokens": 272802959.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 1.684375,
|
|
"epoch": 0.4045373787071204,
|
|
"grad_norm": 0.31550715545184377,
|
|
"learning_rate": 4.1126532337607445e-06,
|
|
"loss": 1.6867,
|
|
"mean_token_accuracy": 0.6516721725463868,
|
|
"num_tokens": 273652093.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 1.7859375,
|
|
"epoch": 0.4059040590405904,
|
|
"grad_norm": 0.37344283947120493,
|
|
"learning_rate": 4.109130618571227e-06,
|
|
"loss": 1.7938,
|
|
"mean_token_accuracy": 0.6364236891269683,
|
|
"num_tokens": 274592185.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 1.765625,
|
|
"epoch": 0.4072707393740604,
|
|
"grad_norm": 0.22813363368410516,
|
|
"learning_rate": 4.105608003381711e-06,
|
|
"loss": 1.7803,
|
|
"mean_token_accuracy": 0.6321838915348053,
|
|
"num_tokens": 275499542.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.4086374197075304,
|
|
"grad_norm": 0.19278343449284185,
|
|
"learning_rate": 4.102085388192194e-06,
|
|
"loss": 1.657,
|
|
"mean_token_accuracy": 0.6577662706375123,
|
|
"num_tokens": 276414366.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 1.790625,
|
|
"epoch": 0.4100041000410004,
|
|
"grad_norm": 0.2940031819571157,
|
|
"learning_rate": 4.098562773002678e-06,
|
|
"loss": 1.7808,
|
|
"mean_token_accuracy": 0.6345036745071411,
|
|
"num_tokens": 277314397.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.4113707803744704,
|
|
"grad_norm": 0.18555992942170593,
|
|
"learning_rate": 4.095040157813161e-06,
|
|
"loss": 1.7132,
|
|
"mean_token_accuracy": 0.6484518706798553,
|
|
"num_tokens": 278278555.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 1.76953125,
|
|
"epoch": 0.41273746070794043,
|
|
"grad_norm": 0.31666960079673373,
|
|
"learning_rate": 4.091517542623644e-06,
|
|
"loss": 1.7719,
|
|
"mean_token_accuracy": 0.6392997860908508,
|
|
"num_tokens": 279188195.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 1.78984375,
|
|
"epoch": 0.4141041410414104,
|
|
"grad_norm": 0.3174555145482765,
|
|
"learning_rate": 4.087994927434128e-06,
|
|
"loss": 1.8196,
|
|
"mean_token_accuracy": 0.6317044138908386,
|
|
"num_tokens": 280103802.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 1.765625,
|
|
"epoch": 0.4154708213748804,
|
|
"grad_norm": 0.15823059715062837,
|
|
"learning_rate": 4.08447231224461e-06,
|
|
"loss": 1.7739,
|
|
"mean_token_accuracy": 0.6376178324222564,
|
|
"num_tokens": 280995189.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.41683750170835043,
|
|
"grad_norm": 0.203569589424748,
|
|
"learning_rate": 4.080949697055094e-06,
|
|
"loss": 1.6792,
|
|
"mean_token_accuracy": 0.6526623606681824,
|
|
"num_tokens": 281875436.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 1.71953125,
|
|
"epoch": 0.4182041820418204,
|
|
"grad_norm": 0.22961318043260978,
|
|
"learning_rate": 4.0774270818655775e-06,
|
|
"loss": 1.7082,
|
|
"mean_token_accuracy": 0.6502086639404296,
|
|
"num_tokens": 282821847.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 1.778125,
|
|
"epoch": 0.4195708623752904,
|
|
"grad_norm": 0.24322835726915612,
|
|
"learning_rate": 4.073904466676061e-06,
|
|
"loss": 1.7747,
|
|
"mean_token_accuracy": 0.635715800523758,
|
|
"num_tokens": 283723983.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 0.42093754270876044,
|
|
"grad_norm": 0.23694788941516173,
|
|
"learning_rate": 4.070381851486544e-06,
|
|
"loss": 1.7183,
|
|
"mean_token_accuracy": 0.6491757929325104,
|
|
"num_tokens": 284689934.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.4223042230422304,
|
|
"grad_norm": 0.25457703324251946,
|
|
"learning_rate": 4.066859236297027e-06,
|
|
"loss": 1.742,
|
|
"mean_token_accuracy": 0.6448338508605957,
|
|
"num_tokens": 285637581.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 0.4236709033757004,
|
|
"grad_norm": 0.20092219743224868,
|
|
"learning_rate": 4.06333662110751e-06,
|
|
"loss": 1.716,
|
|
"mean_token_accuracy": 0.6484358251094818,
|
|
"num_tokens": 286530061.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 1.81875,
|
|
"epoch": 0.42503758370917044,
|
|
"grad_norm": 0.1962349738071507,
|
|
"learning_rate": 4.059814005917994e-06,
|
|
"loss": 1.836,
|
|
"mean_token_accuracy": 0.6318033337593079,
|
|
"num_tokens": 287493792.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 1.76171875,
|
|
"epoch": 0.4264042640426404,
|
|
"grad_norm": 0.31223819409826664,
|
|
"learning_rate": 4.056291390728477e-06,
|
|
"loss": 1.7599,
|
|
"mean_token_accuracy": 0.6434646248817444,
|
|
"num_tokens": 288416181.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 1.7828125,
|
|
"epoch": 0.4277709443761104,
|
|
"grad_norm": 0.28179041460669607,
|
|
"learning_rate": 4.052768775538961e-06,
|
|
"loss": 1.7913,
|
|
"mean_token_accuracy": 0.6391430675983429,
|
|
"num_tokens": 289364059.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 1.72109375,
|
|
"epoch": 0.42913762470958045,
|
|
"grad_norm": 0.3232566105759596,
|
|
"learning_rate": 4.049246160349444e-06,
|
|
"loss": 1.7123,
|
|
"mean_token_accuracy": 0.6490186154842377,
|
|
"num_tokens": 290322031.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.43050430504305043,
|
|
"grad_norm": 0.32182295745437867,
|
|
"learning_rate": 4.045723545159927e-06,
|
|
"loss": 1.6913,
|
|
"mean_token_accuracy": 0.6516251146793366,
|
|
"num_tokens": 291207891.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.4318709853765204,
|
|
"grad_norm": 0.3126800865153388,
|
|
"learning_rate": 4.04220092997041e-06,
|
|
"loss": 1.6662,
|
|
"mean_token_accuracy": 0.6565351009368896,
|
|
"num_tokens": 292155031.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 1.7234375,
|
|
"epoch": 0.43323766570999045,
|
|
"grad_norm": 0.2742447599384884,
|
|
"learning_rate": 4.038678314780894e-06,
|
|
"loss": 1.6967,
|
|
"mean_token_accuracy": 0.6514041244983673,
|
|
"num_tokens": 293128892.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.43460434604346043,
|
|
"grad_norm": 0.23567888261167305,
|
|
"learning_rate": 4.0351556995913775e-06,
|
|
"loss": 1.7278,
|
|
"mean_token_accuracy": 0.644339656829834,
|
|
"num_tokens": 294102287.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.4359710263769304,
|
|
"grad_norm": 0.22457042691586004,
|
|
"learning_rate": 4.03163308440186e-06,
|
|
"loss": 1.7278,
|
|
"mean_token_accuracy": 0.6451429665088654,
|
|
"num_tokens": 295011762.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.43733770671040045,
|
|
"grad_norm": 0.299808200562303,
|
|
"learning_rate": 4.028110469212344e-06,
|
|
"loss": 1.7196,
|
|
"mean_token_accuracy": 0.6503589510917663,
|
|
"num_tokens": 295896186.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.43870438704387044,
|
|
"grad_norm": 0.22075133935028748,
|
|
"learning_rate": 4.0245878540228264e-06,
|
|
"loss": 1.7325,
|
|
"mean_token_accuracy": 0.6458438396453857,
|
|
"num_tokens": 296809525.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 0.4400710673773404,
|
|
"grad_norm": 0.3382784287165396,
|
|
"learning_rate": 4.02106523883331e-06,
|
|
"loss": 1.7316,
|
|
"mean_token_accuracy": 0.6465146601200104,
|
|
"num_tokens": 297704160.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.44143774771081046,
|
|
"grad_norm": 0.2718977412162226,
|
|
"learning_rate": 4.0175426236437935e-06,
|
|
"loss": 1.6475,
|
|
"mean_token_accuracy": 0.6609158277511596,
|
|
"num_tokens": 298596359.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.44280442804428044,
|
|
"grad_norm": 0.317494708471766,
|
|
"learning_rate": 4.014020008454277e-06,
|
|
"loss": 1.6797,
|
|
"mean_token_accuracy": 0.6527561664581298,
|
|
"num_tokens": 299524843.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 0.4441711083777504,
|
|
"grad_norm": 0.1662907783106165,
|
|
"learning_rate": 4.01049739326476e-06,
|
|
"loss": 1.6856,
|
|
"mean_token_accuracy": 0.6513387322425842,
|
|
"num_tokens": 300391883.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 1.759375,
|
|
"epoch": 0.44553778871122046,
|
|
"grad_norm": 0.22401224094651517,
|
|
"learning_rate": 4.006974778075243e-06,
|
|
"loss": 1.7586,
|
|
"mean_token_accuracy": 0.642477935552597,
|
|
"num_tokens": 301330612.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 1.78125,
|
|
"epoch": 0.44690446904469044,
|
|
"grad_norm": 0.28295893303584996,
|
|
"learning_rate": 4.003452162885727e-06,
|
|
"loss": 1.7904,
|
|
"mean_token_accuracy": 0.6329545199871063,
|
|
"num_tokens": 302282719.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 0.4482711493781604,
|
|
"grad_norm": 0.2674894954699526,
|
|
"learning_rate": 3.99992954769621e-06,
|
|
"loss": 1.6984,
|
|
"mean_token_accuracy": 0.6506749629974365,
|
|
"num_tokens": 303205564.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.44963782971163047,
|
|
"grad_norm": 0.24303260027106968,
|
|
"learning_rate": 3.996406932506693e-06,
|
|
"loss": 1.7098,
|
|
"mean_token_accuracy": 0.6477024137973786,
|
|
"num_tokens": 304170131.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.45100451004510045,
|
|
"grad_norm": 0.2552127333756443,
|
|
"learning_rate": 3.992884317317177e-06,
|
|
"loss": 1.7342,
|
|
"mean_token_accuracy": 0.6459564805030823,
|
|
"num_tokens": 305140078.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 1.76796875,
|
|
"epoch": 0.45237119037857043,
|
|
"grad_norm": 0.4673591017920392,
|
|
"learning_rate": 3.98936170212766e-06,
|
|
"loss": 1.7892,
|
|
"mean_token_accuracy": 0.6377993583679199,
|
|
"num_tokens": 306091203.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.45373787071204047,
|
|
"grad_norm": 0.27527428102895357,
|
|
"learning_rate": 3.985839086938143e-06,
|
|
"loss": 1.7309,
|
|
"mean_token_accuracy": 0.6447494387626648,
|
|
"num_tokens": 307003639.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 1.75859375,
|
|
"epoch": 0.45510455104551045,
|
|
"grad_norm": 0.2032120596124749,
|
|
"learning_rate": 3.9823164717486265e-06,
|
|
"loss": 1.7723,
|
|
"mean_token_accuracy": 0.6391928553581238,
|
|
"num_tokens": 307967156.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 1.7015625,
|
|
"epoch": 0.45647123137898044,
|
|
"grad_norm": 0.24721952486119533,
|
|
"learning_rate": 3.97879385655911e-06,
|
|
"loss": 1.7047,
|
|
"mean_token_accuracy": 0.6484906375408173,
|
|
"num_tokens": 308874027.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 1.81875,
|
|
"epoch": 0.4578379117124505,
|
|
"grad_norm": 0.20206312450600125,
|
|
"learning_rate": 3.9752712413695936e-06,
|
|
"loss": 1.819,
|
|
"mean_token_accuracy": 0.6326112866401672,
|
|
"num_tokens": 309787558.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 1.85,
|
|
"epoch": 0.45920459204592046,
|
|
"grad_norm": 0.16311629324752688,
|
|
"learning_rate": 3.971748626180076e-06,
|
|
"loss": 1.8536,
|
|
"mean_token_accuracy": 0.624925309419632,
|
|
"num_tokens": 310715373.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 0.46057127237939044,
|
|
"grad_norm": 0.2392053289534042,
|
|
"learning_rate": 3.96822601099056e-06,
|
|
"loss": 1.7399,
|
|
"mean_token_accuracy": 0.6430058181285858,
|
|
"num_tokens": 311674326.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 0.4619379527128605,
|
|
"grad_norm": 0.3389685780795785,
|
|
"learning_rate": 3.9647033958010425e-06,
|
|
"loss": 1.7157,
|
|
"mean_token_accuracy": 0.6456503629684448,
|
|
"num_tokens": 312578943.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 0.46330463304633046,
|
|
"grad_norm": 0.5219919338108684,
|
|
"learning_rate": 3.961180780611527e-06,
|
|
"loss": 1.7394,
|
|
"mean_token_accuracy": 0.6395533680915833,
|
|
"num_tokens": 313496112.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 0.46467131337980044,
|
|
"grad_norm": 0.2349984609221839,
|
|
"learning_rate": 3.95765816542201e-06,
|
|
"loss": 1.7335,
|
|
"mean_token_accuracy": 0.6459803640842438,
|
|
"num_tokens": 314472756.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 1.7375,
|
|
"epoch": 0.4660379937132705,
|
|
"grad_norm": 0.23256366498354158,
|
|
"learning_rate": 3.954135550232493e-06,
|
|
"loss": 1.7192,
|
|
"mean_token_accuracy": 0.6476417124271393,
|
|
"num_tokens": 315372023.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 1.7671875,
|
|
"epoch": 0.46740467404674046,
|
|
"grad_norm": 0.45126082009123436,
|
|
"learning_rate": 3.950612935042976e-06,
|
|
"loss": 1.7531,
|
|
"mean_token_accuracy": 0.6422662615776062,
|
|
"num_tokens": 316296666.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 1.69921875,
|
|
"epoch": 0.46877135438021045,
|
|
"grad_norm": 0.20163471516683065,
|
|
"learning_rate": 3.947090319853459e-06,
|
|
"loss": 1.7046,
|
|
"mean_token_accuracy": 0.649307256937027,
|
|
"num_tokens": 317209258.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 1.77578125,
|
|
"epoch": 0.4701380347136805,
|
|
"grad_norm": 0.33959020236569004,
|
|
"learning_rate": 3.943567704663943e-06,
|
|
"loss": 1.764,
|
|
"mean_token_accuracy": 0.6403445303440094,
|
|
"num_tokens": 318111951.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 1.69609375,
|
|
"epoch": 0.47150471504715047,
|
|
"grad_norm": 0.18416343224410683,
|
|
"learning_rate": 3.9400450894744265e-06,
|
|
"loss": 1.7059,
|
|
"mean_token_accuracy": 0.6486511826515198,
|
|
"num_tokens": 318994627.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 1.73984375,
|
|
"epoch": 0.47287139538062045,
|
|
"grad_norm": 0.20161458910673036,
|
|
"learning_rate": 3.936522474284909e-06,
|
|
"loss": 1.7683,
|
|
"mean_token_accuracy": 0.6392531514167785,
|
|
"num_tokens": 319941249.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 0.4742380757140905,
|
|
"grad_norm": 0.29065136599767394,
|
|
"learning_rate": 3.932999859095393e-06,
|
|
"loss": 1.7469,
|
|
"mean_token_accuracy": 0.6440402746200562,
|
|
"num_tokens": 320934764.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 0.4756047560475605,
|
|
"grad_norm": 0.261394912316309,
|
|
"learning_rate": 3.929477243905876e-06,
|
|
"loss": 1.6846,
|
|
"mean_token_accuracy": 0.6533670008182526,
|
|
"num_tokens": 321857548.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 1.753125,
|
|
"epoch": 0.47697143638103046,
|
|
"grad_norm": 0.27527076733695427,
|
|
"learning_rate": 3.925954628716359e-06,
|
|
"loss": 1.7419,
|
|
"mean_token_accuracy": 0.6452029824256897,
|
|
"num_tokens": 322775118.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 0.4783381167145005,
|
|
"grad_norm": 0.7755100198691318,
|
|
"learning_rate": 3.9224320135268425e-06,
|
|
"loss": 1.7228,
|
|
"mean_token_accuracy": 0.645637971162796,
|
|
"num_tokens": 323676639.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 1.75234375,
|
|
"epoch": 0.4797047970479705,
|
|
"grad_norm": 0.2739751313077443,
|
|
"learning_rate": 3.918909398337326e-06,
|
|
"loss": 1.7563,
|
|
"mean_token_accuracy": 0.6408751368522644,
|
|
"num_tokens": 324594286.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 1.70703125,
|
|
"epoch": 0.48107147738144046,
|
|
"grad_norm": 0.2203255639507906,
|
|
"learning_rate": 3.91538678314781e-06,
|
|
"loss": 1.7035,
|
|
"mean_token_accuracy": 0.6501042008399963,
|
|
"num_tokens": 325536845.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 1.76328125,
|
|
"epoch": 0.4824381577149105,
|
|
"grad_norm": 0.21713154975807264,
|
|
"learning_rate": 3.911864167958292e-06,
|
|
"loss": 1.7536,
|
|
"mean_token_accuracy": 0.6430023849010468,
|
|
"num_tokens": 326473013.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 0.4838048380483805,
|
|
"grad_norm": 0.18105298453939858,
|
|
"learning_rate": 3.908341552768776e-06,
|
|
"loss": 1.6882,
|
|
"mean_token_accuracy": 0.6515076696872711,
|
|
"num_tokens": 327397369.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.48517151838185046,
|
|
"grad_norm": 0.22373505757024392,
|
|
"learning_rate": 3.9048189375792586e-06,
|
|
"loss": 1.6964,
|
|
"mean_token_accuracy": 0.6472815036773681,
|
|
"num_tokens": 328289318.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 1.74921875,
|
|
"epoch": 0.4865381987153205,
|
|
"grad_norm": 0.18356069346138854,
|
|
"learning_rate": 3.901296322389743e-06,
|
|
"loss": 1.7559,
|
|
"mean_token_accuracy": 0.6402955055236816,
|
|
"num_tokens": 329184488.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 1.75546875,
|
|
"epoch": 0.4879048790487905,
|
|
"grad_norm": 0.30463514727710617,
|
|
"learning_rate": 3.897773707200226e-06,
|
|
"loss": 1.746,
|
|
"mean_token_accuracy": 0.6413840234279633,
|
|
"num_tokens": 330093094.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 1.73359375,
|
|
"epoch": 0.48927155938226047,
|
|
"grad_norm": 0.22221259780817773,
|
|
"learning_rate": 3.894251092010709e-06,
|
|
"loss": 1.7243,
|
|
"mean_token_accuracy": 0.6436603963375092,
|
|
"num_tokens": 331040583.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.4906382397157305,
|
|
"grad_norm": 0.31976208307870374,
|
|
"learning_rate": 3.890728476821192e-06,
|
|
"loss": 1.6692,
|
|
"mean_token_accuracy": 0.6557289659976959,
|
|
"num_tokens": 331938883.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 0.4920049200492005,
|
|
"grad_norm": 0.2773419504750461,
|
|
"learning_rate": 3.8872058616316755e-06,
|
|
"loss": 1.5898,
|
|
"mean_token_accuracy": 0.6689355254173279,
|
|
"num_tokens": 332830337.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.49337160038267047,
|
|
"grad_norm": 0.21222803422418113,
|
|
"learning_rate": 3.883683246442159e-06,
|
|
"loss": 1.6925,
|
|
"mean_token_accuracy": 0.6495516717433929,
|
|
"num_tokens": 333744571.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 1.7140625,
|
|
"epoch": 0.4947382807161405,
|
|
"grad_norm": 0.28971688605859036,
|
|
"learning_rate": 3.8801606312526426e-06,
|
|
"loss": 1.7096,
|
|
"mean_token_accuracy": 0.6480568468570709,
|
|
"num_tokens": 334714247.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.4961049610496105,
|
|
"grad_norm": 0.2428058086937818,
|
|
"learning_rate": 3.876638016063125e-06,
|
|
"loss": 1.6321,
|
|
"mean_token_accuracy": 0.6623587012290955,
|
|
"num_tokens": 335654043.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 0.4974716413830805,
|
|
"grad_norm": 0.21747280506818784,
|
|
"learning_rate": 3.873115400873609e-06,
|
|
"loss": 1.6679,
|
|
"mean_token_accuracy": 0.6548601031303406,
|
|
"num_tokens": 336536161.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 1.77265625,
|
|
"epoch": 0.4988383217165505,
|
|
"grad_norm": 0.2632091231833541,
|
|
"learning_rate": 3.869592785684092e-06,
|
|
"loss": 1.7712,
|
|
"mean_token_accuracy": 0.6412346720695495,
|
|
"num_tokens": 337464793.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.5002050020500205,
|
|
"grad_norm": 0.19589353044196484,
|
|
"learning_rate": 3.866070170494575e-06,
|
|
"loss": 1.6833,
|
|
"mean_token_accuracy": 0.6537026226520538,
|
|
"num_tokens": 338365127.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 1.7328125,
|
|
"epoch": 0.5015716823834905,
|
|
"grad_norm": 0.5019725222397052,
|
|
"learning_rate": 3.862547555305059e-06,
|
|
"loss": 1.7489,
|
|
"mean_token_accuracy": 0.6431100785732269,
|
|
"num_tokens": 339329655.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 1.71953125,
|
|
"epoch": 0.5029383627169605,
|
|
"grad_norm": 0.4713149381083709,
|
|
"learning_rate": 3.859024940115542e-06,
|
|
"loss": 1.7417,
|
|
"mean_token_accuracy": 0.6434930503368378,
|
|
"num_tokens": 340262970.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 0.5043050430504306,
|
|
"grad_norm": 0.21003995731902983,
|
|
"learning_rate": 3.855502324926026e-06,
|
|
"loss": 1.6509,
|
|
"mean_token_accuracy": 0.6593895077705383,
|
|
"num_tokens": 341234175.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.5056717233839005,
|
|
"grad_norm": 0.19952429593010576,
|
|
"learning_rate": 3.851979709736508e-06,
|
|
"loss": 1.7159,
|
|
"mean_token_accuracy": 0.6493825256824494,
|
|
"num_tokens": 342141137.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 0.5070384037173705,
|
|
"grad_norm": 0.22177543091799465,
|
|
"learning_rate": 3.848457094546992e-06,
|
|
"loss": 1.6967,
|
|
"mean_token_accuracy": 0.652348917722702,
|
|
"num_tokens": 343060210.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 0.5084050840508405,
|
|
"grad_norm": 0.24628022474888386,
|
|
"learning_rate": 3.8449344793574755e-06,
|
|
"loss": 1.6836,
|
|
"mean_token_accuracy": 0.6538033306598663,
|
|
"num_tokens": 344016777.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 1.703125,
|
|
"epoch": 0.5097717643843105,
|
|
"grad_norm": 0.24881774925804537,
|
|
"learning_rate": 3.841411864167959e-06,
|
|
"loss": 1.71,
|
|
"mean_token_accuracy": 0.6485269129276275,
|
|
"num_tokens": 344927589.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 1.72890625,
|
|
"epoch": 0.5111384447177805,
|
|
"grad_norm": 0.5942847281835786,
|
|
"learning_rate": 3.837889248978442e-06,
|
|
"loss": 1.7443,
|
|
"mean_token_accuracy": 0.6416549265384675,
|
|
"num_tokens": 345834726.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.5125051250512506,
|
|
"grad_norm": 0.23659974625792446,
|
|
"learning_rate": 3.834366633788925e-06,
|
|
"loss": 1.6718,
|
|
"mean_token_accuracy": 0.6545387327671051,
|
|
"num_tokens": 346764539.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.5138718053847205,
|
|
"grad_norm": 0.2477999804512993,
|
|
"learning_rate": 3.830844018599408e-06,
|
|
"loss": 1.7136,
|
|
"mean_token_accuracy": 0.6461123585700989,
|
|
"num_tokens": 347629175.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 1.73125,
|
|
"epoch": 0.5152384857181905,
|
|
"grad_norm": 0.20113601715467977,
|
|
"learning_rate": 3.8273214034098915e-06,
|
|
"loss": 1.7259,
|
|
"mean_token_accuracy": 0.6462513327598571,
|
|
"num_tokens": 348554495.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 1.7953125,
|
|
"epoch": 0.5166051660516605,
|
|
"grad_norm": 0.28906056599341284,
|
|
"learning_rate": 3.823798788220375e-06,
|
|
"loss": 1.7915,
|
|
"mean_token_accuracy": 0.636573314666748,
|
|
"num_tokens": 349451252.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.5179718463851305,
|
|
"grad_norm": 0.34348919593165855,
|
|
"learning_rate": 3.820276173030859e-06,
|
|
"loss": 1.6879,
|
|
"mean_token_accuracy": 0.6539805948734283,
|
|
"num_tokens": 350354085.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.5193385267186005,
|
|
"grad_norm": 0.21883092616457825,
|
|
"learning_rate": 3.816753557841341e-06,
|
|
"loss": 1.7036,
|
|
"mean_token_accuracy": 0.6492615401744842,
|
|
"num_tokens": 351249652.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 0.5207052070520706,
|
|
"grad_norm": 0.2737460682191046,
|
|
"learning_rate": 3.813230942651825e-06,
|
|
"loss": 1.664,
|
|
"mean_token_accuracy": 0.654195636510849,
|
|
"num_tokens": 352120557.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 1.70078125,
|
|
"epoch": 0.5220718873855406,
|
|
"grad_norm": 0.4320600415465639,
|
|
"learning_rate": 3.809708327462308e-06,
|
|
"loss": 1.703,
|
|
"mean_token_accuracy": 0.6462425947189331,
|
|
"num_tokens": 353020320.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 1.7453125,
|
|
"epoch": 0.5234385677190105,
|
|
"grad_norm": 0.24791832180469212,
|
|
"learning_rate": 3.806185712272792e-06,
|
|
"loss": 1.7649,
|
|
"mean_token_accuracy": 0.6426183938980102,
|
|
"num_tokens": 353962098.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 1.78203125,
|
|
"epoch": 0.5248052480524805,
|
|
"grad_norm": 0.2735584458911381,
|
|
"learning_rate": 3.802663097083275e-06,
|
|
"loss": 1.7914,
|
|
"mean_token_accuracy": 0.6335326790809631,
|
|
"num_tokens": 354830901.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 1.73203125,
|
|
"epoch": 0.5261719283859505,
|
|
"grad_norm": 0.21640291583224616,
|
|
"learning_rate": 3.799140481893758e-06,
|
|
"loss": 1.732,
|
|
"mean_token_accuracy": 0.6455218732357025,
|
|
"num_tokens": 355723832.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.5275386087194205,
|
|
"grad_norm": 0.190345828948229,
|
|
"learning_rate": 3.7956178667042413e-06,
|
|
"loss": 1.6699,
|
|
"mean_token_accuracy": 0.6571952342987061,
|
|
"num_tokens": 356608380.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 0.5289052890528906,
|
|
"grad_norm": 0.18297450803661872,
|
|
"learning_rate": 3.792095251514725e-06,
|
|
"loss": 1.6103,
|
|
"mean_token_accuracy": 0.6614236891269684,
|
|
"num_tokens": 357544747.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 0.5302719693863606,
|
|
"grad_norm": 0.2123179762932747,
|
|
"learning_rate": 3.788572636325208e-06,
|
|
"loss": 1.6908,
|
|
"mean_token_accuracy": 0.6551902711391449,
|
|
"num_tokens": 358509204.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 0.5316386497198305,
|
|
"grad_norm": 0.2568340404250557,
|
|
"learning_rate": 3.7850500211356916e-06,
|
|
"loss": 1.6215,
|
|
"mean_token_accuracy": 0.6613417625427246,
|
|
"num_tokens": 359444874.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 1.72421875,
|
|
"epoch": 0.5330053300533005,
|
|
"grad_norm": 0.2254100311331544,
|
|
"learning_rate": 3.7815274059461747e-06,
|
|
"loss": 1.7238,
|
|
"mean_token_accuracy": 0.6467018246650695,
|
|
"num_tokens": 360362774.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.5343720103867705,
|
|
"grad_norm": 0.16354185475314606,
|
|
"learning_rate": 3.7780047907566582e-06,
|
|
"loss": 1.716,
|
|
"mean_token_accuracy": 0.6463874101638794,
|
|
"num_tokens": 361307525.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 0.5357386907202405,
|
|
"grad_norm": 0.15403614250266587,
|
|
"learning_rate": 3.7744821755671413e-06,
|
|
"loss": 1.6868,
|
|
"mean_token_accuracy": 0.6558946549892426,
|
|
"num_tokens": 362257037.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 0.5371053710537106,
|
|
"grad_norm": 0.3318745461627844,
|
|
"learning_rate": 3.7709595603776245e-06,
|
|
"loss": 1.7947,
|
|
"mean_token_accuracy": 0.6386385202407837,
|
|
"num_tokens": 363189485.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 1.73671875,
|
|
"epoch": 0.5384720513871806,
|
|
"grad_norm": 0.22031221838501192,
|
|
"learning_rate": 3.767436945188108e-06,
|
|
"loss": 1.7461,
|
|
"mean_token_accuracy": 0.6409023463726043,
|
|
"num_tokens": 364095833.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.5398387317206506,
|
|
"grad_norm": 0.22904265583546085,
|
|
"learning_rate": 3.7639143299985916e-06,
|
|
"loss": 1.6656,
|
|
"mean_token_accuracy": 0.6553525030612946,
|
|
"num_tokens": 365015444.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.5412054120541205,
|
|
"grad_norm": 0.19160524117517025,
|
|
"learning_rate": 3.7603917148090747e-06,
|
|
"loss": 1.7243,
|
|
"mean_token_accuracy": 0.6460123419761657,
|
|
"num_tokens": 365939690.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 0.5425720923875905,
|
|
"grad_norm": 0.2404530716765565,
|
|
"learning_rate": 3.756869099619558e-06,
|
|
"loss": 1.6896,
|
|
"mean_token_accuracy": 0.653221583366394,
|
|
"num_tokens": 366874816.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 0.5439387727210605,
|
|
"grad_norm": 0.2998658795865026,
|
|
"learning_rate": 3.753346484430041e-06,
|
|
"loss": 1.6967,
|
|
"mean_token_accuracy": 0.6510720670223236,
|
|
"num_tokens": 367790589.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.5453054530545306,
|
|
"grad_norm": 0.21052796475727809,
|
|
"learning_rate": 3.749823869240524e-06,
|
|
"loss": 1.6367,
|
|
"mean_token_accuracy": 0.6604826748371124,
|
|
"num_tokens": 368700832.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.5466721333880006,
|
|
"grad_norm": 0.21535091614835272,
|
|
"learning_rate": 3.746301254051008e-06,
|
|
"loss": 1.67,
|
|
"mean_token_accuracy": 0.6563111841678619,
|
|
"num_tokens": 369616242.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.5480388137214706,
|
|
"grad_norm": 0.2616966601540361,
|
|
"learning_rate": 3.742778638861491e-06,
|
|
"loss": 1.6343,
|
|
"mean_token_accuracy": 0.6562695384025574,
|
|
"num_tokens": 370524916.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 0.5494054940549405,
|
|
"grad_norm": 0.24060746351004628,
|
|
"learning_rate": 3.7392560236719743e-06,
|
|
"loss": 1.7223,
|
|
"mean_token_accuracy": 0.6460127353668212,
|
|
"num_tokens": 371404934.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 0.5507721743884105,
|
|
"grad_norm": 0.22094522589664575,
|
|
"learning_rate": 3.7357334084824574e-06,
|
|
"loss": 1.5947,
|
|
"mean_token_accuracy": 0.6669115722179413,
|
|
"num_tokens": 372321799.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 0.5521388547218805,
|
|
"grad_norm": 0.4241061720449766,
|
|
"learning_rate": 3.732210793292941e-06,
|
|
"loss": 1.6768,
|
|
"mean_token_accuracy": 0.6541450142860412,
|
|
"num_tokens": 373250508.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 1.76953125,
|
|
"epoch": 0.5535055350553506,
|
|
"grad_norm": 0.18860244851944485,
|
|
"learning_rate": 3.7286881781034245e-06,
|
|
"loss": 1.7734,
|
|
"mean_token_accuracy": 0.6379761457443237,
|
|
"num_tokens": 374193398.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 0.5548722153888206,
|
|
"grad_norm": 0.30451806956816824,
|
|
"learning_rate": 3.7251655629139076e-06,
|
|
"loss": 1.7119,
|
|
"mean_token_accuracy": 0.6494364321231842,
|
|
"num_tokens": 375146899.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 1.71953125,
|
|
"epoch": 0.5562388957222906,
|
|
"grad_norm": 0.2057704320523261,
|
|
"learning_rate": 3.7216429477243907e-06,
|
|
"loss": 1.7165,
|
|
"mean_token_accuracy": 0.6448180973529816,
|
|
"num_tokens": 376072161.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 0.5576055760557606,
|
|
"grad_norm": 0.4008578939700692,
|
|
"learning_rate": 3.7181203325348743e-06,
|
|
"loss": 1.7174,
|
|
"mean_token_accuracy": 0.648190951347351,
|
|
"num_tokens": 377038564.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 1.81484375,
|
|
"epoch": 0.5589722563892305,
|
|
"grad_norm": 0.22470298923196702,
|
|
"learning_rate": 3.7145977173453574e-06,
|
|
"loss": 1.8074,
|
|
"mean_token_accuracy": 0.6337240099906921,
|
|
"num_tokens": 377934841.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 0.5603389367227005,
|
|
"grad_norm": 0.2238216027665433,
|
|
"learning_rate": 3.7110751021558405e-06,
|
|
"loss": 1.6893,
|
|
"mean_token_accuracy": 0.6508188366889953,
|
|
"num_tokens": 378834104.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.5617056170561706,
|
|
"grad_norm": 0.21609286213863219,
|
|
"learning_rate": 3.7075524869663245e-06,
|
|
"loss": 1.6617,
|
|
"mean_token_accuracy": 0.6519636511802673,
|
|
"num_tokens": 379788082.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.5630722973896406,
|
|
"grad_norm": 0.3014444184599367,
|
|
"learning_rate": 3.7040298717768076e-06,
|
|
"loss": 1.6877,
|
|
"mean_token_accuracy": 0.6522670030593872,
|
|
"num_tokens": 380736739.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 0.5644389777231106,
|
|
"grad_norm": 0.5834533386086772,
|
|
"learning_rate": 3.7005072565872908e-06,
|
|
"loss": 1.7057,
|
|
"mean_token_accuracy": 0.6473897337913513,
|
|
"num_tokens": 381695370.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 0.5658056580565806,
|
|
"grad_norm": 0.3399628044779758,
|
|
"learning_rate": 3.696984641397774e-06,
|
|
"loss": 1.6797,
|
|
"mean_token_accuracy": 0.6516806840896606,
|
|
"num_tokens": 382611634.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 0.5671723383900505,
|
|
"grad_norm": 0.33931943523543295,
|
|
"learning_rate": 3.693462026208257e-06,
|
|
"loss": 1.6345,
|
|
"mean_token_accuracy": 0.659611165523529,
|
|
"num_tokens": 383533791.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 0.5685390187235205,
|
|
"grad_norm": 0.21581430422059183,
|
|
"learning_rate": 3.689939411018741e-06,
|
|
"loss": 1.7302,
|
|
"mean_token_accuracy": 0.6492313265800476,
|
|
"num_tokens": 384499584.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 0.5699056990569906,
|
|
"grad_norm": 0.3913218377671697,
|
|
"learning_rate": 3.686416795829224e-06,
|
|
"loss": 1.6177,
|
|
"mean_token_accuracy": 0.6616530418395996,
|
|
"num_tokens": 385411287.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 1.71484375,
|
|
"epoch": 0.5712723793904606,
|
|
"grad_norm": 0.2105405330526442,
|
|
"learning_rate": 3.6828941806397072e-06,
|
|
"loss": 1.7183,
|
|
"mean_token_accuracy": 0.6481317818164826,
|
|
"num_tokens": 386303109.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 1.7015625,
|
|
"epoch": 0.5726390597239306,
|
|
"grad_norm": 0.34593921882803574,
|
|
"learning_rate": 3.6793715654501903e-06,
|
|
"loss": 1.7028,
|
|
"mean_token_accuracy": 0.6485572576522827,
|
|
"num_tokens": 387245146.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.5740057400574006,
|
|
"grad_norm": 0.40715983810068124,
|
|
"learning_rate": 3.6758489502606735e-06,
|
|
"loss": 1.7045,
|
|
"mean_token_accuracy": 0.6528811275959014,
|
|
"num_tokens": 388221243.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 0.5753724203908706,
|
|
"grad_norm": 0.183027748916355,
|
|
"learning_rate": 3.672326335071157e-06,
|
|
"loss": 1.6721,
|
|
"mean_token_accuracy": 0.6541451334953308,
|
|
"num_tokens": 389119685.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.5767391007243405,
|
|
"grad_norm": 0.3371473482698261,
|
|
"learning_rate": 3.6688037198816406e-06,
|
|
"loss": 1.6858,
|
|
"mean_token_accuracy": 0.6541908562183381,
|
|
"num_tokens": 390035308.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 0.5781057810578106,
|
|
"grad_norm": 0.2362436720434539,
|
|
"learning_rate": 3.6652811046921237e-06,
|
|
"loss": 1.6764,
|
|
"mean_token_accuracy": 0.6549977660179138,
"num_tokens": 390926384.0,
"step": 4230
},
{
"entropy": 1.6515625,
"epoch": 0.5794724613912806,
"grad_norm": 0.33869415956833737,
"learning_rate": 3.6617584895026072e-06,
"loss": 1.6552,
"mean_token_accuracy": 0.6564251899719238,
"num_tokens": 391814200.0,
"step": 4240
},
{
"entropy": 1.7828125,
"epoch": 0.5808391417247506,
"grad_norm": 0.6329270462344389,
"learning_rate": 3.6582358743130904e-06,
"loss": 1.7856,
"mean_token_accuracy": 0.6333066582679748,
"num_tokens": 392739581.0,
"step": 4250
},
{
"entropy": 1.6875,
"epoch": 0.5822058220582206,
"grad_norm": 0.19900913378058654,
"learning_rate": 3.6547132591235735e-06,
"loss": 1.6835,
"mean_token_accuracy": 0.6511561214923859,
"num_tokens": 393627810.0,
"step": 4260
},
{
"entropy": 1.6703125,
"epoch": 0.5835725023916906,
"grad_norm": 0.19707046933874453,
"learning_rate": 3.651190643934057e-06,
"loss": 1.6723,
"mean_token_accuracy": 0.653937292098999,
"num_tokens": 394490297.0,
"step": 4270
},
{
"entropy": 1.67578125,
"epoch": 0.5849391827251605,
"grad_norm": 0.26140435557932745,
"learning_rate": 3.6476680287445406e-06,
"loss": 1.6878,
"mean_token_accuracy": 0.6537287056446075,
"num_tokens": 395437827.0,
"step": 4280
},
{
"entropy": 1.6234375,
"epoch": 0.5863058630586306,
"grad_norm": 0.21760686117629843,
"learning_rate": 3.6441454135550237e-06,
"loss": 1.6084,
"mean_token_accuracy": 0.6647980034351348,
"num_tokens": 396384945.0,
"step": 4290
},
{
"entropy": 1.68828125,
"epoch": 0.5876725433921006,
"grad_norm": 0.20834910524714434,
"learning_rate": 3.640622798365507e-06,
"loss": 1.7077,
"mean_token_accuracy": 0.6470435321331024,
"num_tokens": 397315120.0,
"step": 4300
},
{
"entropy": 1.75703125,
"epoch": 0.5890392237255706,
"grad_norm": 0.18443428767629,
"learning_rate": 3.63710018317599e-06,
"loss": 1.751,
"mean_token_accuracy": 0.6398531854152679,
"num_tokens": 398245301.0,
"step": 4310
},
{
"entropy": 1.79609375,
"epoch": 0.5904059040590406,
"grad_norm": 0.19306982270332274,
"learning_rate": 3.633577567986473e-06,
"loss": 1.7973,
"mean_token_accuracy": 0.6350760400295258,
"num_tokens": 399147230.0,
"step": 4320
},
{
"entropy": 1.68984375,
"epoch": 0.5917725843925106,
"grad_norm": 0.225851813054414,
"learning_rate": 3.630054952796957e-06,
"loss": 1.6857,
"mean_token_accuracy": 0.6524592161178588,
"num_tokens": 400040509.0,
"step": 4330
},
{
"entropy": 1.7640625,
"epoch": 0.5931392647259806,
"grad_norm": 0.23247973325450383,
"learning_rate": 3.62653233760744e-06,
"loss": 1.7643,
"mean_token_accuracy": 0.6392862796783447,
"num_tokens": 400965744.0,
"step": 4340
},
{
"entropy": 1.671875,
"epoch": 0.5945059450594506,
"grad_norm": 0.20967650340444574,
"learning_rate": 3.6230097224179233e-06,
"loss": 1.6603,
|
|
"mean_token_accuracy": 0.6581998705863953,
|
|
"num_tokens": 401779502.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 0.5958726253929206,
|
|
"grad_norm": 0.2246934651169307,
|
|
"learning_rate": 3.6194871072284064e-06,
|
|
"loss": 1.6962,
|
|
"mean_token_accuracy": 0.6502827286720276,
|
|
"num_tokens": 402719455.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.5972393057263906,
|
|
"grad_norm": 0.22828413394924688,
|
|
"learning_rate": 3.61596449203889e-06,
|
|
"loss": 1.7191,
|
|
"mean_token_accuracy": 0.647072821855545,
|
|
"num_tokens": 403639016.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 0.5986059860598606,
|
|
"grad_norm": 0.30078991147337353,
|
|
"learning_rate": 3.6124418768493735e-06,
|
|
"loss": 1.5883,
|
|
"mean_token_accuracy": 0.6675835192203522,
|
|
"num_tokens": 404548467.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 1.75859375,
|
|
"epoch": 0.5999726663933306,
|
|
"grad_norm": 0.2566992792494114,
|
|
"learning_rate": 3.6089192616598566e-06,
|
|
"loss": 1.769,
|
|
"mean_token_accuracy": 0.640259999036789,
|
|
"num_tokens": 405533941.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 1.73828125,
|
|
"epoch": 0.6013393467268006,
|
|
"grad_norm": 0.2541275054771533,
|
|
"learning_rate": 3.6053966464703398e-06,
|
|
"loss": 1.7488,
|
|
"mean_token_accuracy": 0.6425503611564636,
|
|
"num_tokens": 406430158.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 1.7703125,
|
|
"epoch": 0.6027060270602707,
|
|
"grad_norm": 0.2173097363658744,
|
|
"learning_rate": 3.6018740312808233e-06,
|
|
"loss": 1.7834,
|
|
"mean_token_accuracy": 0.6364037096500397,
|
|
"num_tokens": 407418490.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 1.70390625,
|
|
"epoch": 0.6040727073937406,
|
|
"grad_norm": 0.23148317619225067,
|
|
"learning_rate": 3.5983514160913064e-06,
|
|
"loss": 1.7154,
|
|
"mean_token_accuracy": 0.6436924993991852,
|
|
"num_tokens": 408358945.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.6054393877272106,
|
|
"grad_norm": 0.18434151122951298,
|
|
"learning_rate": 3.5948288009017895e-06,
|
|
"loss": 1.7126,
|
|
"mean_token_accuracy": 0.6455449044704438,
|
|
"num_tokens": 409287305.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 1.7015625,
|
|
"epoch": 0.6068060680606806,
|
|
"grad_norm": 0.19904282835525552,
|
|
"learning_rate": 3.591306185712273e-06,
|
|
"loss": 1.6948,
|
|
"mean_token_accuracy": 0.6497150123119354,
|
|
"num_tokens": 410205183.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 1.7984375,
|
|
"epoch": 0.6081727483941506,
|
|
"grad_norm": 0.27623548180417706,
|
|
"learning_rate": 3.5877835705227566e-06,
|
|
"loss": 1.7899,
|
|
"mean_token_accuracy": 0.6360922396183014,
|
|
"num_tokens": 411105275.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 0.6095394287276206,
|
|
"grad_norm": 0.32276386944544777,
|
|
"learning_rate": 3.5842609553332398e-06,
|
|
"loss": 1.6928,
|
|
"mean_token_accuracy": 0.6517738580703736,
|
|
"num_tokens": 412024681.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 1.740625,
|
|
"epoch": 0.6109061090610907,
|
|
"grad_norm": 0.28603648979257906,
|
|
"learning_rate": 3.580738340143723e-06,
|
|
"loss": 1.7528,
|
|
"mean_token_accuracy": 0.6412916004657745,
|
|
"num_tokens": 412940225.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 1.69609375,
|
|
"epoch": 0.6122727893945606,
|
|
"grad_norm": 0.34584868158521204,
|
|
"learning_rate": 3.577215724954206e-06,
|
|
"loss": 1.712,
|
|
"mean_token_accuracy": 0.6492802143096924,
|
|
"num_tokens": 413896456.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 1.74921875,
|
|
"epoch": 0.6136394697280306,
|
|
"grad_norm": 0.17527238662124317,
|
|
"learning_rate": 3.57369310976469e-06,
|
|
"loss": 1.7587,
|
|
"mean_token_accuracy": 0.6386999070644379,
|
|
"num_tokens": 414778486.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 0.6150061500615006,
|
|
"grad_norm": 0.1385309659039619,
|
|
"learning_rate": 3.570170494575173e-06,
|
|
"loss": 1.7027,
|
|
"mean_token_accuracy": 0.6517612159252166,
|
|
"num_tokens": 415726092.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 1.79921875,
|
|
"epoch": 0.6163728303949706,
|
|
"grad_norm": 0.21920626558180265,
|
|
"learning_rate": 3.5666478793856562e-06,
|
|
"loss": 1.8233,
|
|
"mean_token_accuracy": 0.6307761251926423,
|
|
"num_tokens": 416632200.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 1.690625,
|
|
"epoch": 0.6177395107284406,
|
|
"grad_norm": 0.45772499905964725,
|
|
"learning_rate": 3.5631252641961394e-06,
|
|
"loss": 1.6961,
|
|
"mean_token_accuracy": 0.651484876871109,
|
|
"num_tokens": 417520457.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 0.6191061910619107,
|
|
"grad_norm": 0.24089803899243842,
|
|
"learning_rate": 3.5596026490066225e-06,
|
|
"loss": 1.6457,
|
|
"mean_token_accuracy": 0.6571079015731811,
|
|
"num_tokens": 418468294.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 0.6204728713953807,
|
|
"grad_norm": 0.22773114552512744,
|
|
"learning_rate": 3.556080033817106e-06,
|
|
"loss": 1.6824,
|
|
"mean_token_accuracy": 0.6536483943462372,
|
|
"num_tokens": 419362385.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 1.7640625,
|
|
"epoch": 0.6218395517288506,
|
|
"grad_norm": 0.19612841230755299,
|
|
"learning_rate": 3.5525574186275896e-06,
|
|
"loss": 1.7558,
|
|
"mean_token_accuracy": 0.6409558653831482,
|
|
"num_tokens": 420251847.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 1.7375,
|
|
"epoch": 0.6232062320623206,
|
|
"grad_norm": 0.3785504668324759,
|
|
"learning_rate": 3.5490348034380727e-06,
|
|
"loss": 1.7553,
|
|
"mean_token_accuracy": 0.6402572393417358,
|
|
"num_tokens": 421183115.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.6245729123957906,
|
|
"grad_norm": 0.218021258186934,
|
|
"learning_rate": 3.545512188248556e-06,
|
|
"loss": 1.7128,
|
|
"mean_token_accuracy": 0.6464867770671845,
|
|
"num_tokens": 422078876.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 0.6259395927292606,
|
|
"grad_norm": 0.213007356584177,
|
|
"learning_rate": 3.5419895730590394e-06,
|
|
"loss": 1.6512,
|
|
"mean_token_accuracy": 0.6550611317157745,
|
|
"num_tokens": 422966194.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.6273062730627307,
|
|
"grad_norm": 0.24475308858872089,
|
|
"learning_rate": 3.5384669578695225e-06,
|
|
"loss": 1.7074,
|
|
"mean_token_accuracy": 0.6475019216537475,
|
|
"num_tokens": 423878054.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 1.7578125,
|
|
"epoch": 0.6286729533962007,
|
|
"grad_norm": 0.2406164314404718,
|
|
"learning_rate": 3.534944342680006e-06,
|
|
"loss": 1.7539,
|
|
"mean_token_accuracy": 0.6422311127185821,
|
|
"num_tokens": 424831275.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 0.6300396337296706,
|
|
"grad_norm": 0.26625620201339567,
|
|
"learning_rate": 3.5314217274904896e-06,
|
|
"loss": 1.6574,
|
|
"mean_token_accuracy": 0.652622389793396,
|
|
"num_tokens": 425757466.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.6314063140631406,
|
|
"grad_norm": 0.32566342113931546,
|
|
"learning_rate": 3.5278991123009727e-06,
|
|
"loss": 1.7302,
|
|
"mean_token_accuracy": 0.6458707928657532,
|
|
"num_tokens": 426670087.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 1.73828125,
|
|
"epoch": 0.6327729943966106,
|
|
"grad_norm": 0.30940877941226796,
|
|
"learning_rate": 3.524376497111456e-06,
|
|
"loss": 1.7533,
|
|
"mean_token_accuracy": 0.6379826724529266,
|
|
"num_tokens": 427615158.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.6341396747300806,
|
|
"grad_norm": 0.33471804162081353,
|
|
"learning_rate": 3.520853881921939e-06,
|
|
"loss": 1.7079,
|
|
"mean_token_accuracy": 0.6494530260562896,
|
|
"num_tokens": 428530062.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 1.71953125,
|
|
"epoch": 0.6355063550635507,
|
|
"grad_norm": 0.21388307680626217,
|
|
"learning_rate": 3.517331266732422e-06,
|
|
"loss": 1.7312,
|
|
"mean_token_accuracy": 0.64705730676651,
|
|
"num_tokens": 429487952.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 1.78125,
|
|
"epoch": 0.6368730353970207,
|
|
"grad_norm": 0.23784617211361717,
|
|
"learning_rate": 3.513808651542906e-06,
|
|
"loss": 1.7946,
|
|
"mean_token_accuracy": 0.6389595568180084,
|
|
"num_tokens": 430398599.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 0.6382397157304907,
|
|
"grad_norm": 0.2964048636126946,
|
|
"learning_rate": 3.510286036353389e-06,
|
|
"loss": 1.6514,
|
|
"mean_token_accuracy": 0.6572791814804078,
|
|
"num_tokens": 431265855.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 0.6396063960639606,
|
|
"grad_norm": 0.2312501926191515,
|
|
"learning_rate": 3.5067634211638723e-06,
|
|
"loss": 1.6823,
|
|
"mean_token_accuracy": 0.652709698677063,
|
|
"num_tokens": 432227400.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 0.6409730763974306,
|
|
"grad_norm": 0.1785450805439902,
|
|
"learning_rate": 3.5032408059743554e-06,
|
|
"loss": 1.6835,
|
|
"mean_token_accuracy": 0.6507722556591033,
|
|
"num_tokens": 433164870.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 0.6423397567309006,
|
|
"grad_norm": 0.1857431016180987,
|
|
"learning_rate": 3.499718190784839e-06,
|
|
"loss": 1.7078,
|
|
"mean_token_accuracy": 0.6474222242832184,
|
|
"num_tokens": 434040769.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 1.72265625,
|
|
"epoch": 0.6437064370643707,
|
|
"grad_norm": 0.22904889179301793,
|
|
"learning_rate": 3.4961955755953225e-06,
|
|
"loss": 1.7273,
|
|
"mean_token_accuracy": 0.6459406793117524,
|
|
"num_tokens": 434968211.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 1.75703125,
|
|
"epoch": 0.6450731173978407,
|
|
"grad_norm": 0.5338243985267983,
|
|
"learning_rate": 3.4926729604058056e-06,
|
|
"loss": 1.7522,
|
|
"mean_token_accuracy": 0.6433603048324585,
|
|
"num_tokens": 435883022.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 0.6464397977313107,
|
|
"grad_norm": 0.20591273494238915,
|
|
"learning_rate": 3.4891503452162888e-06,
|
|
"loss": 1.6896,
|
|
"mean_token_accuracy": 0.6533815860748291,
|
|
"num_tokens": 436755247.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 0.6478064780647806,
|
|
"grad_norm": 0.3304901331203685,
|
|
"learning_rate": 3.4856277300267723e-06,
|
|
"loss": 1.662,
|
|
"mean_token_accuracy": 0.6551942586898803,
|
|
"num_tokens": 437650684.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.6491731583982506,
|
|
"grad_norm": 0.18178647431037462,
|
|
"learning_rate": 3.4821051148372554e-06,
|
|
"loss": 1.6918,
|
|
"mean_token_accuracy": 0.649263072013855,
|
|
"num_tokens": 438548846.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 0.6505398387317206,
|
|
"grad_norm": 0.26843615535087584,
|
|
"learning_rate": 3.4785824996477386e-06,
|
|
"loss": 1.6127,
|
|
"mean_token_accuracy": 0.6607375383377075,
|
|
"num_tokens": 439428931.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 0.6519065190651907,
|
|
"grad_norm": 0.1796181050924724,
|
|
"learning_rate": 3.475059884458222e-06,
|
|
"loss": 1.6223,
|
|
"mean_token_accuracy": 0.6597198247909546,
|
|
"num_tokens": 440339961.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 0.6532731993986607,
|
|
"grad_norm": 0.3864206021226985,
|
|
"learning_rate": 3.4715372692687057e-06,
|
|
"loss": 1.6416,
|
|
"mean_token_accuracy": 0.6581093549728394,
|
|
"num_tokens": 441219394.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.6546398797321307,
|
|
"grad_norm": 0.32352651703340296,
|
|
"learning_rate": 3.4680146540791888e-06,
|
|
"loss": 1.7339,
|
|
"mean_token_accuracy": 0.643412035703659,
|
|
"num_tokens": 442157850.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.6560065600656007,
|
|
"grad_norm": 0.21591686943455582,
|
|
"learning_rate": 3.464492038889672e-06,
|
|
"loss": 1.7144,
|
|
"mean_token_accuracy": 0.6481489181518555,
|
|
"num_tokens": 443030450.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 1.72109375,
|
|
"epoch": 0.6573732403990706,
|
|
"grad_norm": 0.2150787211766561,
|
|
"learning_rate": 3.460969423700155e-06,
|
|
"loss": 1.7347,
|
|
"mean_token_accuracy": 0.6422638714313507,
|
|
"num_tokens": 444000622.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 0.6587399207325406,
|
|
"grad_norm": 0.1623684752755695,
|
|
"learning_rate": 3.457446808510639e-06,
|
|
"loss": 1.6886,
|
|
"mean_token_accuracy": 0.6517220199108124,
|
|
"num_tokens": 444938932.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.6601066010660107,
|
|
"grad_norm": 0.39510894329983964,
|
|
"learning_rate": 3.453924193321122e-06,
|
|
"loss": 1.6863,
|
|
"mean_token_accuracy": 0.6509363651275635,
|
|
"num_tokens": 445901681.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 0.6614732813994807,
|
|
"grad_norm": 0.2180005057943921,
|
|
"learning_rate": 3.4504015781316052e-06,
|
|
"loss": 1.6308,
|
|
"mean_token_accuracy": 0.6607716679573059,
|
|
"num_tokens": 446853313.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 1.75625,
|
|
"epoch": 0.6628399617329507,
|
|
"grad_norm": 0.2865385351523113,
|
|
"learning_rate": 3.4468789629420884e-06,
|
|
"loss": 1.7725,
|
|
"mean_token_accuracy": 0.6392990112304687,
|
|
"num_tokens": 447838229.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 1.64375,
|
|
"epoch": 0.6642066420664207,
|
|
"grad_norm": 0.2808200406316363,
|
|
"learning_rate": 3.4433563477525715e-06,
|
|
"loss": 1.6622,
|
|
"mean_token_accuracy": 0.6573486387729645,
|
|
"num_tokens": 448755093.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 0.6655733223998906,
|
|
"grad_norm": 0.1794132559194482,
|
|
"learning_rate": 3.439833732563055e-06,
|
|
"loss": 1.6691,
|
|
"mean_token_accuracy": 0.6535703420639039,
|
|
"num_tokens": 449667782.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 0.6669400027333606,
|
|
"grad_norm": 0.4009759662515757,
|
|
"learning_rate": 3.4363111173735386e-06,
|
|
"loss": 1.7407,
|
|
"mean_token_accuracy": 0.6423938572406769,
|
|
"num_tokens": 450632308.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.6683066830668307,
|
|
"grad_norm": 0.22316812777315928,
|
|
"learning_rate": 3.4327885021840217e-06,
|
|
"loss": 1.7165,
|
|
"mean_token_accuracy": 0.6491022408008575,
|
|
"num_tokens": 451577185.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.6696733634003007,
|
|
"grad_norm": 0.17743828374528495,
|
|
"learning_rate": 3.429265886994505e-06,
|
|
"loss": 1.7246,
|
|
"mean_token_accuracy": 0.6462555825710297,
|
|
"num_tokens": 452536865.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 1.684375,
|
|
"epoch": 0.6710400437337707,
|
|
"grad_norm": 0.3600212467988076,
|
|
"learning_rate": 3.4257432718049884e-06,
|
|
"loss": 1.6788,
|
|
"mean_token_accuracy": 0.6530853152275086,
|
|
"num_tokens": 453483309.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 0.6724067240672407,
|
|
"grad_norm": 0.23844436319203505,
|
|
"learning_rate": 3.4222206566154715e-06,
|
|
"loss": 1.7086,
|
|
"mean_token_accuracy": 0.6500102460384369,
|
|
"num_tokens": 454442112.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 1.7421875,
|
|
"epoch": 0.6737734044007107,
|
|
"grad_norm": 0.20605498676137388,
|
|
"learning_rate": 3.418698041425955e-06,
|
|
"loss": 1.7478,
|
|
"mean_token_accuracy": 0.639013010263443,
|
|
"num_tokens": 455350450.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 0.6751400847341806,
|
|
"grad_norm": 0.18038073717583683,
|
|
"learning_rate": 3.415175426236438e-06,
|
|
"loss": 1.6487,
|
|
"mean_token_accuracy": 0.6585165798664093,
|
|
"num_tokens": 456300216.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 0.6765067650676507,
|
|
"grad_norm": 0.29571061887441913,
|
|
"learning_rate": 3.4116528110469217e-06,
|
|
"loss": 1.5839,
|
|
"mean_token_accuracy": 0.6715575814247131,
|
|
"num_tokens": 457228479.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 0.6778734454011207,
|
|
"grad_norm": 0.18144113323372812,
|
|
"learning_rate": 3.408130195857405e-06,
|
|
"loss": 1.626,
|
|
"mean_token_accuracy": 0.6617911398410797,
|
|
"num_tokens": 458107823.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.6792401257345907,
|
|
"grad_norm": 0.19936202843290346,
|
|
"learning_rate": 3.404607580667888e-06,
|
|
"loss": 1.6796,
|
|
"mean_token_accuracy": 0.6530916094779968,
|
|
"num_tokens": 459041764.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 0.6806068060680607,
|
|
"grad_norm": 0.2098379610415125,
|
|
"learning_rate": 3.401084965478371e-06,
|
|
"loss": 1.6112,
|
|
"mean_token_accuracy": 0.6645114123821259,
|
|
"num_tokens": 459944825.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.6819734864015307,
|
|
"grad_norm": 0.18859474599064338,
|
|
"learning_rate": 3.397562350288855e-06,
|
|
"loss": 1.7209,
|
|
"mean_token_accuracy": 0.6473674595355987,
|
|
"num_tokens": 460873922.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 0.6833401667350006,
|
|
"grad_norm": 0.2213408010963225,
|
|
"learning_rate": 3.394039735099338e-06,
|
|
"loss": 1.6936,
|
|
"mean_token_accuracy": 0.6486734032630921,
|
|
"num_tokens": 461798069.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 0.6847068470684707,
|
|
"grad_norm": 0.25312358332462964,
|
|
"learning_rate": 3.3905171199098213e-06,
|
|
"loss": 1.6041,
|
|
"mean_token_accuracy": 0.6623551249504089,
|
|
"num_tokens": 462707061.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 0.6860735274019407,
|
|
"grad_norm": 0.1861486069530759,
|
|
"learning_rate": 3.3869945047203044e-06,
|
|
"loss": 1.647,
|
|
"mean_token_accuracy": 0.6549373209476471,
|
|
"num_tokens": 463652650.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 0.6874402077354107,
|
|
"grad_norm": 0.28817620394802734,
|
|
"learning_rate": 3.3834718895307876e-06,
|
|
"loss": 1.7108,
|
|
"mean_token_accuracy": 0.6474705040454865,
|
|
"num_tokens": 464558485.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 1.67734375,
|
|
"epoch": 0.6888068880688807,
|
|
"grad_norm": 0.2557942307338885,
|
|
"learning_rate": 3.3799492743412715e-06,
|
|
"loss": 1.686,
|
|
"mean_token_accuracy": 0.6529182732105255,
|
|
"num_tokens": 465486995.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 0.6901735684023507,
|
|
"grad_norm": 0.24270652230869288,
|
|
"learning_rate": 3.3764266591517547e-06,
|
|
"loss": 1.6275,
|
|
"mean_token_accuracy": 0.6597605347633362,
|
|
"num_tokens": 466423274.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 0.6915402487358207,
|
|
"grad_norm": 0.21058706455333273,
|
|
"learning_rate": 3.3729040439622378e-06,
|
|
"loss": 1.6552,
|
|
"mean_token_accuracy": 0.653503155708313,
|
|
"num_tokens": 467353785.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 1.734375,
|
|
"epoch": 0.6929069290692907,
|
|
"grad_norm": 0.28570091534021774,
|
|
"learning_rate": 3.3693814287727213e-06,
|
|
"loss": 1.7401,
|
|
"mean_token_accuracy": 0.6428472816944122,
|
|
"num_tokens": 468250452.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 1.71796875,
|
|
"epoch": 0.6942736094027607,
|
|
"grad_norm": 0.3248185646149371,
|
|
"learning_rate": 3.3658588135832044e-06,
|
|
"loss": 1.711,
|
|
"mean_token_accuracy": 0.6478906393051147,
|
|
"num_tokens": 469238815.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 0.6956402897362307,
|
|
"grad_norm": 0.23414570534630375,
|
|
"learning_rate": 3.3623361983936876e-06,
|
|
"loss": 1.6644,
|
|
"mean_token_accuracy": 0.6560370028018951,
|
|
"num_tokens": 470233127.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 1.69921875,
|
|
"epoch": 0.6970069700697007,
|
|
"grad_norm": 0.19445353089944964,
|
|
"learning_rate": 3.358813583204171e-06,
|
|
"loss": 1.7041,
|
|
"mean_token_accuracy": 0.6490730345249176,
|
|
"num_tokens": 471210133.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 1.78203125,
|
|
"epoch": 0.6983736504031707,
|
|
"grad_norm": 0.3365488218887799,
|
|
"learning_rate": 3.3552909680146547e-06,
|
|
"loss": 1.7792,
|
|
"mean_token_accuracy": 0.6369401752948761,
|
|
"num_tokens": 472133584.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 0.6997403307366407,
|
|
"grad_norm": 0.5394382273230122,
|
|
"learning_rate": 3.3517683528251378e-06,
|
|
"loss": 1.6377,
|
|
"mean_token_accuracy": 0.6588806271553039,
|
|
"num_tokens": 473064020.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 0.7011070110701108,
|
|
"grad_norm": 0.17007117965122664,
|
|
"learning_rate": 3.348245737635621e-06,
|
|
"loss": 1.6468,
|
|
"mean_token_accuracy": 0.6568370223045349,
|
|
"num_tokens": 474009584.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.7024736914035807,
|
|
"grad_norm": 0.17717175772404697,
|
|
"learning_rate": 3.344723122446104e-06,
|
|
"loss": 1.6997,
|
|
"mean_token_accuracy": 0.6479429066181183,
|
|
"num_tokens": 474900206.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 1.7109375,
|
|
"epoch": 0.7038403717370507,
|
|
"grad_norm": 0.1918191359961147,
|
|
"learning_rate": 3.341200507256587e-06,
|
|
"loss": 1.7162,
|
|
"mean_token_accuracy": 0.6495449900627136,
|
|
"num_tokens": 475818347.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 1.74140625,
|
|
"epoch": 0.7052070520705207,
|
|
"grad_norm": 0.19538647322899894,
|
|
"learning_rate": 3.337677892067071e-06,
|
|
"loss": 1.7451,
|
|
"mean_token_accuracy": 0.643576318025589,
|
|
"num_tokens": 476726068.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 1.73671875,
|
|
"epoch": 0.7065737324039907,
|
|
"grad_norm": 0.3859579101516492,
|
|
"learning_rate": 3.3341552768775543e-06,
|
|
"loss": 1.7346,
|
|
"mean_token_accuracy": 0.6418679416179657,
|
|
"num_tokens": 477653445.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 1.70234375,
|
|
"epoch": 0.7079404127374607,
|
|
"grad_norm": 0.3469873315794702,
|
|
"learning_rate": 3.3306326616880374e-06,
|
|
"loss": 1.705,
|
|
"mean_token_accuracy": 0.6493131935596466,
|
|
"num_tokens": 478579246.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 1.76171875,
|
|
"epoch": 0.7093070930709308,
|
|
"grad_norm": 0.23509563757123544,
|
|
"learning_rate": 3.3271100464985205e-06,
|
|
"loss": 1.7553,
|
|
"mean_token_accuracy": 0.6415084540843964,
|
|
"num_tokens": 479517246.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 0.7106737734044007,
|
|
"grad_norm": 0.25033435282354216,
|
|
"learning_rate": 3.323587431309004e-06,
|
|
"loss": 1.6589,
|
|
"mean_token_accuracy": 0.6573800504207611,
|
|
"num_tokens": 480480522.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 1.74453125,
|
|
"epoch": 0.7120404537378707,
|
|
"grad_norm": 0.19555414259296353,
|
|
"learning_rate": 3.3200648161194876e-06,
|
|
"loss": 1.7251,
|
|
"mean_token_accuracy": 0.6435604095458984,
|
|
"num_tokens": 481412727.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.7134071340713407,
|
|
"grad_norm": 0.23399541060039428,
|
|
"learning_rate": 3.3165422009299707e-06,
|
|
"loss": 1.6562,
|
|
"mean_token_accuracy": 0.6550616204738617,
|
|
"num_tokens": 482340840.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 0.7147738144048107,
|
|
"grad_norm": 0.19991348720681507,
|
|
"learning_rate": 3.313019585740454e-06,
|
|
"loss": 1.687,
|
|
"mean_token_accuracy": 0.6502235293388366,
|
|
"num_tokens": 483263001.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 1.7484375,
|
|
"epoch": 0.7161404947382807,
|
|
"grad_norm": 0.24238682492590025,
|
|
"learning_rate": 3.3094969705509374e-06,
|
|
"loss": 1.7553,
|
|
"mean_token_accuracy": 0.6406795084476471,
|
|
"num_tokens": 484188391.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 0.7175071750717508,
|
|
"grad_norm": 0.14482175740050088,
|
|
"learning_rate": 3.3059743553614205e-06,
|
|
"loss": 1.6553,
|
|
"mean_token_accuracy": 0.6538036942481995,
|
|
"num_tokens": 485098911.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 0.7188738554052208,
|
|
"grad_norm": 0.23721661467078098,
|
|
"learning_rate": 3.3024517401719036e-06,
|
|
"loss": 1.6613,
|
|
"mean_token_accuracy": 0.653850382566452,
|
|
"num_tokens": 486001337.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 0.7202405357386907,
|
|
"grad_norm": 0.23838959251340097,
|
|
"learning_rate": 3.298929124982387e-06,
|
|
"loss": 1.6759,
|
|
"mean_token_accuracy": 0.6549750328063965,
|
|
"num_tokens": 486936981.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.7216072160721607,
|
|
"grad_norm": 0.2725987766862762,
|
|
"learning_rate": 3.2954065097928707e-06,
|
|
"loss": 1.7369,
|
|
"mean_token_accuracy": 0.6457023441791534,
|
|
"num_tokens": 487881943.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.7229738964056307,
|
|
"grad_norm": 0.22642282234878602,
|
|
"learning_rate": 3.291883894603354e-06,
|
|
"loss": 1.6878,
|
|
"mean_token_accuracy": 0.6538751006126404,
|
|
"num_tokens": 488802969.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 0.7243405767391007,
|
|
"grad_norm": 0.18370806847210525,
|
|
"learning_rate": 3.288361279413837e-06,
|
|
"loss": 1.623,
|
|
"mean_token_accuracy": 0.6588943302631378,
|
|
"num_tokens": 489715738.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.7257072570725708,
|
|
"grad_norm": 0.25502526836952755,
|
|
"learning_rate": 3.28483866422432e-06,
|
|
"loss": 1.6543,
|
|
"mean_token_accuracy": 0.6579097807407379,
|
|
"num_tokens": 490628704.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 1.75546875,
|
|
"epoch": 0.7270739374060408,
|
|
"grad_norm": 0.1870213254230513,
|
|
"learning_rate": 3.281316049034804e-06,
|
|
"loss": 1.7773,
|
|
"mean_token_accuracy": 0.6337605953216553,
|
|
"num_tokens": 491605390.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 1.69609375,
|
|
"epoch": 0.7284406177395107,
|
|
"grad_norm": 0.27235023213001536,
|
|
"learning_rate": 3.277793433845287e-06,
|
|
"loss": 1.6919,
|
|
"mean_token_accuracy": 0.6466127276420593,
|
|
"num_tokens": 492532452.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 0.7298072980729807,
|
|
"grad_norm": 0.25509051048194203,
|
|
"learning_rate": 3.2742708186557703e-06,
|
|
"loss": 1.6389,
|
|
"mean_token_accuracy": 0.6600806713104248,
|
|
"num_tokens": 493480971.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 0.7311739784064507,
|
|
"grad_norm": 0.1898748668187117,
|
|
"learning_rate": 3.2707482034662534e-06,
|
|
"loss": 1.7227,
|
|
"mean_token_accuracy": 0.6433460652828217,
|
|
"num_tokens": 494410085.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 0.7325406587399207,
|
|
"grad_norm": 0.2875200387261605,
|
|
"learning_rate": 3.2672255882767366e-06,
|
|
"loss": 1.6498,
|
|
"mean_token_accuracy": 0.6594021737575531,
|
|
"num_tokens": 495342482.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 0.7339073390733908,
|
|
"grad_norm": 0.2656258156465888,
|
|
"learning_rate": 3.26370297308722e-06,
|
|
"loss": 1.6867,
|
|
"mean_token_accuracy": 0.6523146688938141,
|
|
"num_tokens": 496297348.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 0.7352740194068608,
|
|
"grad_norm": 0.2010638879594976,
|
|
"learning_rate": 3.2601803578977037e-06,
|
|
"loss": 1.675,
|
|
"mean_token_accuracy": 0.6512361705303192,
|
|
"num_tokens": 497217174.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 0.7366406997403308,
|
|
"grad_norm": 0.1759652595802032,
|
|
"learning_rate": 3.2566577427081868e-06,
|
|
"loss": 1.6165,
|
|
"mean_token_accuracy": 0.6612531781196594,
|
|
"num_tokens": 498177395.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.7380073800738007,
|
|
"grad_norm": 0.21666296209357613,
|
|
"learning_rate": 3.25313512751867e-06,
|
|
"loss": 1.6619,
|
|
"mean_token_accuracy": 0.654757434129715,
|
|
"num_tokens": 499082648.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.7393740604072707,
|
|
"grad_norm": 0.24118378888719103,
|
|
"learning_rate": 3.2496125123291535e-06,
|
|
"loss": 1.7369,
|
|
"mean_token_accuracy": 0.6413352012634277,
|
|
"num_tokens": 500003549.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 0.7407407407407407,
|
|
"grad_norm": 0.22143250465713687,
|
|
"learning_rate": 3.2460898971396366e-06,
|
|
"loss": 1.6308,
|
|
"mean_token_accuracy": 0.6614049255847931,
|
|
"num_tokens": 500935417.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.7421074210742108,
|
|
"grad_norm": 0.151940322973488,
|
|
"learning_rate": 3.24256728195012e-06,
|
|
"loss": 1.6748,
|
|
"mean_token_accuracy": 0.6531229317188263,
|
|
"num_tokens": 501845208.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 0.7434741014076808,
|
|
"grad_norm": 0.2622507831704641,
|
|
"learning_rate": 3.2390446667606037e-06,
|
|
"loss": 1.6785,
|
|
"mean_token_accuracy": 0.6535347521305084,
|
|
"num_tokens": 502800522.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 0.7448407817411508,
|
|
"grad_norm": 0.25596187257392095,
|
|
"learning_rate": 3.235522051571087e-06,
|
|
"loss": 1.6661,
|
|
"mean_token_accuracy": 0.6546599388122558,
|
|
"num_tokens": 503721058.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 0.7462074620746207,
|
|
"grad_norm": 0.23663753760490783,
|
|
"learning_rate": 3.23199943638157e-06,
|
|
"loss": 1.6919,
|
|
"mean_token_accuracy": 0.6503893554210662,
|
|
"num_tokens": 504604771.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 0.7475741424080907,
|
|
"grad_norm": 0.26197415948011116,
|
|
"learning_rate": 3.228476821192053e-06,
|
|
"loss": 1.7118,
|
|
"mean_token_accuracy": 0.6464606821537018,
|
|
"num_tokens": 505532814.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 0.7489408227415607,
|
|
"grad_norm": 0.3144236431952025,
|
|
"learning_rate": 3.224954206002536e-06,
|
|
"loss": 1.7034,
|
|
"mean_token_accuracy": 0.6477314710617066,
|
|
"num_tokens": 506489179.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 0.7503075030750308,
|
|
"grad_norm": 0.17753537079904383,
|
|
"learning_rate": 3.22143159081302e-06,
|
|
"loss": 1.6597,
|
|
"mean_token_accuracy": 0.6582888245582581,
|
|
"num_tokens": 507415977.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 1.7640625,
|
|
"epoch": 0.7516741834085008,
|
|
"grad_norm": 0.16559253760339865,
|
|
"learning_rate": 3.2179089756235033e-06,
|
|
"loss": 1.752,
|
|
"mean_token_accuracy": 0.6386685252189637,
|
|
"num_tokens": 508329780.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 1.7375,
|
|
"epoch": 0.7530408637419708,
|
|
"grad_norm": 0.3560882380897593,
|
|
"learning_rate": 3.2143863604339864e-06,
|
|
"loss": 1.7447,
|
|
"mean_token_accuracy": 0.6425258636474609,
|
|
"num_tokens": 509271018.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 1.690625,
|
|
"epoch": 0.7544075440754408,
|
|
"grad_norm": 0.17469213512151083,
|
|
"learning_rate": 3.2108637452444695e-06,
|
|
"loss": 1.7023,
|
|
"mean_token_accuracy": 0.6493143796920776,
|
|
"num_tokens": 510187295.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 1.7296875,
|
|
"epoch": 0.7557742244089107,
|
|
"grad_norm": 0.1670658441397371,
|
|
"learning_rate": 3.2073411300549526e-06,
|
|
"loss": 1.7287,
|
|
"mean_token_accuracy": 0.6471672534942627,
|
|
"num_tokens": 511095224.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 0.7571409047423807,
|
|
"grad_norm": 0.2303716670012626,
|
|
"learning_rate": 3.2038185148654366e-06,
|
|
"loss": 1.6379,
|
|
"mean_token_accuracy": 0.6598055720329284,
|
|
"num_tokens": 512008947.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 0.7585075850758508,
|
|
"grad_norm": 0.32082152201861763,
|
|
"learning_rate": 3.2002958996759197e-06,
|
|
"loss": 1.7275,
|
|
"mean_token_accuracy": 0.6451738357543946,
|
|
"num_tokens": 512898128.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.7598742654093208,
|
|
"grad_norm": 0.3015788411507871,
|
|
"learning_rate": 3.196773284486403e-06,
|
|
"loss": 1.7209,
|
|
"mean_token_accuracy": 0.6476833045482635,
|
|
"num_tokens": 513824504.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 1.74921875,
|
|
"epoch": 0.7612409457427908,
|
|
"grad_norm": 0.24035024520816942,
|
|
"learning_rate": 3.1932506692968864e-06,
|
|
"loss": 1.7412,
|
|
"mean_token_accuracy": 0.64242525100708,
|
|
"num_tokens": 514761292.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 0.7626076260762608,
|
|
"grad_norm": 0.19365269249921965,
|
|
"learning_rate": 3.1897280541073695e-06,
|
|
"loss": 1.6931,
|
|
"mean_token_accuracy": 0.648711484670639,
|
|
"num_tokens": 515703995.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 1.7296875,
|
|
"epoch": 0.7639743064097307,
|
|
"grad_norm": 0.205967850628241,
|
|
"learning_rate": 3.1862054389178526e-06,
|
|
"loss": 1.7378,
|
|
"mean_token_accuracy": 0.6442757904529571,
|
|
"num_tokens": 516679963.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 0.7653409867432007,
|
|
"grad_norm": 0.2182895005393608,
|
|
"learning_rate": 3.182682823728336e-06,
|
|
"loss": 1.6557,
|
|
"mean_token_accuracy": 0.6549648880958557,
|
|
"num_tokens": 517614363.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.7667076670766708,
|
|
"grad_norm": 0.17214741461281047,
|
|
"learning_rate": 3.1791602085388197e-06,
|
|
"loss": 1.645,
|
|
"mean_token_accuracy": 0.6578444004058838,
|
|
"num_tokens": 518550025.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.7680743474101408,
|
|
"grad_norm": 0.19150257720643848,
|
|
"learning_rate": 3.175637593349303e-06,
|
|
"loss": 1.7211,
|
|
"mean_token_accuracy": 0.6455064594745636,
|
|
"num_tokens": 519458389.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 0.7694410277436108,
|
|
"grad_norm": 0.23407667382083427,
|
|
"learning_rate": 3.172114978159786e-06,
|
|
"loss": 1.7371,
|
|
"mean_token_accuracy": 0.6430055558681488,
|
|
"num_tokens": 520405621.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 0.7708077080770808,
|
|
"grad_norm": 0.23277635779306546,
|
|
"learning_rate": 3.168592362970269e-06,
|
|
"loss": 1.7272,
|
|
"mean_token_accuracy": 0.6473752677440643,
|
|
"num_tokens": 521293103.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 0.7721743884105507,
|
|
"grad_norm": 0.2856375483125862,
|
|
"learning_rate": 3.165069747780753e-06,
|
|
"loss": 1.6813,
|
|
"mean_token_accuracy": 0.6528005123138427,
|
|
"num_tokens": 522187353.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 1.728125,
|
|
"epoch": 0.7735410687440207,
|
|
"grad_norm": 0.21682976886737929,
|
|
"learning_rate": 3.161547132591236e-06,
|
|
"loss": 1.7468,
|
|
"mean_token_accuracy": 0.6418661653995514,
|
|
"num_tokens": 523133109.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 1.7109375,
|
|
"epoch": 0.7749077490774908,
|
|
"grad_norm": 0.24221668675122657,
|
|
"learning_rate": 3.1580245174017193e-06,
|
|
"loss": 1.7248,
|
|
"mean_token_accuracy": 0.6443256855010986,
|
|
"num_tokens": 524072890.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 1.70703125,
|
|
"epoch": 0.7762744294109608,
|
|
"grad_norm": 0.22596056764532932,
|
|
"learning_rate": 3.1545019022122025e-06,
|
|
"loss": 1.7185,
|
|
"mean_token_accuracy": 0.6479742467403412,
|
|
"num_tokens": 525002561.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 1.7546875,
|
|
"epoch": 0.7776411097444308,
|
|
"grad_norm": 0.27046561753969406,
|
|
"learning_rate": 3.1509792870226856e-06,
|
|
"loss": 1.7748,
|
|
"mean_token_accuracy": 0.6366844236850738,
|
|
"num_tokens": 525933295.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 0.7790077900779008,
|
|
"grad_norm": 0.3552026451017819,
|
|
"learning_rate": 3.147456671833169e-06,
|
|
"loss": 1.6642,
|
|
"mean_token_accuracy": 0.6539491713047028,
|
|
"num_tokens": 526844669.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.7803744704113708,
|
|
"grad_norm": 0.2147758595926868,
|
|
"learning_rate": 3.1439340566436527e-06,
|
|
"loss": 1.6597,
|
|
"mean_token_accuracy": 0.6543346762657165,
|
|
"num_tokens": 527768490.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 0.7817411507448407,
|
|
"grad_norm": 0.5331692862683884,
|
|
"learning_rate": 3.140411441454136e-06,
|
|
"loss": 1.7023,
|
|
"mean_token_accuracy": 0.6498912632465362,
|
|
"num_tokens": 528686430.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 1.690625,
|
|
"epoch": 0.7831078310783108,
|
|
"grad_norm": 0.21227236794457688,
|
|
"learning_rate": 3.136888826264619e-06,
|
|
"loss": 1.6935,
|
|
"mean_token_accuracy": 0.6484281659126282,
|
|
"num_tokens": 529607572.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 0.7844745114117808,
|
|
"grad_norm": 0.254365672421938,
|
|
"learning_rate": 3.1333662110751025e-06,
|
|
"loss": 1.6768,
|
|
"mean_token_accuracy": 0.6509453535079956,
|
|
"num_tokens": 530507987.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 0.7858411917452508,
|
|
"grad_norm": 0.19699154965327934,
|
|
"learning_rate": 3.1298435958855856e-06,
|
|
"loss": 1.6153,
|
|
"mean_token_accuracy": 0.6619944155216217,
|
|
"num_tokens": 531389383.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 1.7140625,
|
|
"epoch": 0.7872078720787208,
|
|
"grad_norm": 0.25848799462005934,
|
|
"learning_rate": 3.126320980696069e-06,
|
|
"loss": 1.7187,
|
|
"mean_token_accuracy": 0.6466306328773499,
|
|
"num_tokens": 532289253.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 1.7015625,
|
|
"epoch": 0.7885745524121908,
|
|
"grad_norm": 0.2110147110494103,
|
|
"learning_rate": 3.1227983655065523e-06,
|
|
"loss": 1.6961,
|
|
"mean_token_accuracy": 0.6480464279651642,
|
|
"num_tokens": 533193996.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 0.7899412327456607,
|
|
"grad_norm": 0.20410057431638648,
|
|
"learning_rate": 3.119275750317036e-06,
|
|
"loss": 1.5987,
|
|
"mean_token_accuracy": 0.6614332675933838,
|
|
"num_tokens": 534115362.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 1.74296875,
|
|
"epoch": 0.7913079130791308,
|
|
"grad_norm": 0.18232824567188238,
|
|
"learning_rate": 3.115753135127519e-06,
|
|
"loss": 1.7582,
|
|
"mean_token_accuracy": 0.639330393075943,
|
|
"num_tokens": 535044869.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 0.7926745934126008,
|
|
"grad_norm": 0.34096093446837167,
|
|
"learning_rate": 3.112230519938002e-06,
|
|
"loss": 1.6622,
|
|
"mean_token_accuracy": 0.6554463922977447,
|
|
"num_tokens": 536000650.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 0.7940412737460708,
|
|
"grad_norm": 0.3669625159061662,
|
|
"learning_rate": 3.108707904748485e-06,
|
|
"loss": 1.69,
|
|
"mean_token_accuracy": 0.6532484710216522,
|
|
"num_tokens": 536860647.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 1.7484375,
|
|
"epoch": 0.7954079540795408,
|
|
"grad_norm": 0.3416709505767769,
|
|
"learning_rate": 3.105185289558969e-06,
|
|
"loss": 1.7832,
|
|
"mean_token_accuracy": 0.6391791045665741,
|
|
"num_tokens": 537829306.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 0.7967746344130108,
|
|
"grad_norm": 0.34722405813342755,
|
|
"learning_rate": 3.1016626743694523e-06,
|
|
"loss": 1.7012,
|
|
"mean_token_accuracy": 0.6489062488079071,
|
|
"num_tokens": 538752933.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 0.7981413147464808,
|
|
"grad_norm": 0.3616768472890402,
|
|
"learning_rate": 3.0981400591799354e-06,
|
|
"loss": 1.6836,
|
|
"mean_token_accuracy": 0.6516611516475678,
|
|
"num_tokens": 539729835.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 0.7995079950799509,
|
|
"grad_norm": 0.2109992023707596,
|
|
"learning_rate": 3.0946174439904185e-06,
|
|
"loss": 1.662,
|
|
"mean_token_accuracy": 0.6563475370407105,
|
|
"num_tokens": 540671422.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 0.8008746754134208,
|
|
"grad_norm": 0.2365144077707054,
|
|
"learning_rate": 3.0910948288009016e-06,
|
|
"loss": 1.6148,
|
|
"mean_token_accuracy": 0.6634587347507477,
|
|
"num_tokens": 541572528.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 0.8022413557468908,
|
|
"grad_norm": 0.20499633304489484,
|
|
"learning_rate": 3.0875722136113856e-06,
|
|
"loss": 1.6854,
|
|
"mean_token_accuracy": 0.6497904300689697,
|
|
"num_tokens": 542488767.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 0.8036080360803608,
|
|
"grad_norm": 0.2093509614404467,
|
|
"learning_rate": 3.0840495984218687e-06,
|
|
"loss": 1.6791,
|
|
"mean_token_accuracy": 0.6536693871021271,
|
|
"num_tokens": 543432978.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 0.8049747164138308,
|
|
"grad_norm": 0.24094578078172124,
|
|
"learning_rate": 3.080526983232352e-06,
|
|
"loss": 1.7137,
|
|
"mean_token_accuracy": 0.6459457993507385,
|
|
"num_tokens": 544345186.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 0.8063413967473008,
|
|
"grad_norm": 0.3041776848847522,
|
|
"learning_rate": 3.0770043680428354e-06,
|
|
"loss": 1.6887,
|
|
"mean_token_accuracy": 0.6482310116291046,
|
|
"num_tokens": 545277032.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 0.8077080770807709,
|
|
"grad_norm": 0.21645179237408865,
|
|
"learning_rate": 3.0734817528533185e-06,
|
|
"loss": 1.6629,
|
|
"mean_token_accuracy": 0.6574439644813538,
|
|
"num_tokens": 546182592.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 1.684375,
|
|
"epoch": 0.8090747574142408,
|
|
"grad_norm": 0.1825982306442583,
|
|
"learning_rate": 3.0699591376638017e-06,
|
|
"loss": 1.6932,
|
|
"mean_token_accuracy": 0.6504955351352691,
|
|
"num_tokens": 547099932.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 0.8104414377477108,
|
|
"grad_norm": 0.252428495003178,
|
|
"learning_rate": 3.066436522474285e-06,
|
|
"loss": 1.6202,
|
|
"mean_token_accuracy": 0.6630837321281433,
|
|
"num_tokens": 548025287.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 0.8118081180811808,
|
|
"grad_norm": 0.22633950711395875,
|
|
"learning_rate": 3.0629139072847688e-06,
|
|
"loss": 1.6185,
|
|
"mean_token_accuracy": 0.6606667101383209,
|
|
"num_tokens": 548995834.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 0.8131747984146508,
|
|
"grad_norm": 0.2956651744087899,
|
|
"learning_rate": 3.059391292095252e-06,
|
|
"loss": 1.677,
|
|
"mean_token_accuracy": 0.6493414402008056,
|
|
"num_tokens": 549898878.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.8145414787481208,
|
|
"grad_norm": 0.1873070827394538,
|
|
"learning_rate": 3.055868676905735e-06,
|
|
"loss": 1.7355,
|
|
"mean_token_accuracy": 0.6446664452552795,
|
|
"num_tokens": 550874340.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 0.8159081590815909,
|
|
"grad_norm": 0.2287196860451835,
|
|
"learning_rate": 3.052346061716218e-06,
|
|
"loss": 1.6783,
|
|
"mean_token_accuracy": 0.6513522207736969,
|
|
"num_tokens": 551740384.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 0.8172748394150608,
|
|
"grad_norm": 0.33498111228371835,
|
|
"learning_rate": 3.048823446526702e-06,
|
|
"loss": 1.7157,
|
|
"mean_token_accuracy": 0.6478020548820496,
|
|
"num_tokens": 552676027.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 1.71015625,
|
|
"epoch": 0.8186415197485308,
|
|
"grad_norm": 0.23977297349777058,
|
|
"learning_rate": 3.0453008313371852e-06,
|
|
"loss": 1.715,
|
|
"mean_token_accuracy": 0.6459953725337982,
|
|
"num_tokens": 553645479.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 0.8200082000820008,
|
|
"grad_norm": 0.273364538053894,
|
|
"learning_rate": 3.0417782161476683e-06,
|
|
"loss": 1.6385,
|
|
"mean_token_accuracy": 0.655714499950409,
|
|
"num_tokens": 554562400.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 1.753125,
|
|
"epoch": 0.8213748804154708,
|
|
"grad_norm": 0.2837421383542773,
|
|
"learning_rate": 3.0382556009581515e-06,
|
|
"loss": 1.7773,
|
|
"mean_token_accuracy": 0.6375827670097352,
|
|
"num_tokens": 555473946.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 0.8227415607489408,
|
|
"grad_norm": 0.33898487599709703,
|
|
"learning_rate": 3.0347329857686346e-06,
|
|
"loss": 1.6269,
|
|
"mean_token_accuracy": 0.659155398607254,
|
|
"num_tokens": 556369175.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 0.8241082410824109,
|
|
"grad_norm": 0.18878135542724211,
|
|
"learning_rate": 3.031210370579118e-06,
|
|
"loss": 1.637,
|
|
"mean_token_accuracy": 0.6619251251220704,
|
|
"num_tokens": 557299105.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 1.7328125,
|
|
"epoch": 0.8254749214158809,
|
|
"grad_norm": 0.2782737038812089,
|
|
"learning_rate": 3.0276877553896017e-06,
|
|
"loss": 1.7157,
|
|
"mean_token_accuracy": 0.644311374425888,
|
|
"num_tokens": 558215699.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 0.8268416017493508,
|
|
"grad_norm": 0.18070603489985643,
|
|
"learning_rate": 3.024165140200085e-06,
|
|
"loss": 1.6715,
|
|
"mean_token_accuracy": 0.6532941818237304,
|
|
"num_tokens": 559131314.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.8282082820828208,
|
|
"grad_norm": 0.315391296962109,
|
|
"learning_rate": 3.020642525010568e-06,
|
|
"loss": 1.715,
|
|
"mean_token_accuracy": 0.6477056503295898,
|
|
"num_tokens": 560041681.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 0.8295749624162908,
|
|
"grad_norm": 0.20972516273923267,
|
|
"learning_rate": 3.0171199098210515e-06,
|
|
"loss": 1.6738,
|
|
"mean_token_accuracy": 0.6539546370506286,
|
|
"num_tokens": 560940196.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 0.8309416427497608,
|
|
"grad_norm": 0.19957438376098738,
|
|
"learning_rate": 3.0135972946315346e-06,
|
|
"loss": 1.6745,
|
|
"mean_token_accuracy": 0.6551594257354736,
|
|
"num_tokens": 561871694.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 0.8323083230832309,
|
|
"grad_norm": 0.24822072980245727,
|
|
"learning_rate": 3.010074679442018e-06,
|
|
"loss": 1.6916,
|
|
"mean_token_accuracy": 0.6511012196540833,
|
|
"num_tokens": 562771957.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 0.8336750034167009,
|
|
"grad_norm": 0.5919722806695641,
|
|
"learning_rate": 3.0065520642525013e-06,
|
|
"loss": 1.72,
|
|
"mean_token_accuracy": 0.6477966785430909,
|
|
"num_tokens": 563679602.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 1.7375,
|
|
"epoch": 0.8350416837501708,
|
|
"grad_norm": 0.4407536475212033,
|
|
"learning_rate": 3.003029449062985e-06,
|
|
"loss": 1.7417,
|
|
"mean_token_accuracy": 0.6431312084197998,
|
|
"num_tokens": 564621008.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 1.68671875,
|
|
"epoch": 0.8364083640836408,
|
|
"grad_norm": 0.29697014952891704,
|
|
"learning_rate": 2.999506833873468e-06,
|
|
"loss": 1.6762,
|
|
"mean_token_accuracy": 0.651240599155426,
|
|
"num_tokens": 565495111.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 1.72109375,
|
|
"epoch": 0.8377750444171108,
|
|
"grad_norm": 0.20057858529867034,
|
|
"learning_rate": 2.995984218683951e-06,
|
|
"loss": 1.7176,
|
|
"mean_token_accuracy": 0.6427354276180267,
|
|
"num_tokens": 566408144.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 0.8391417247505808,
|
|
"grad_norm": 0.20165292920211492,
|
|
"learning_rate": 2.992461603494434e-06,
|
|
"loss": 1.6503,
|
|
"mean_token_accuracy": 0.6538468956947326,
|
|
"num_tokens": 567352436.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.8405084050840509,
|
|
"grad_norm": 0.3185555864697012,
|
|
"learning_rate": 2.988938988304918e-06,
|
|
"loss": 1.6769,
|
|
"mean_token_accuracy": 0.6519819080829621,
|
|
"num_tokens": 568298035.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 0.8418750854175209,
|
|
"grad_norm": 0.2210094257186105,
|
|
"learning_rate": 2.9854163731154013e-06,
|
|
"loss": 1.672,
|
|
"mean_token_accuracy": 0.6555897414684295,
|
|
"num_tokens": 569208485.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 0.8432417657509909,
|
|
"grad_norm": 0.44349679909405193,
|
|
"learning_rate": 2.9818937579258844e-06,
|
|
"loss": 1.6712,
|
|
"mean_token_accuracy": 0.6527296006679535,
|
|
"num_tokens": 570085273.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.8446084460844608,
|
|
"grad_norm": 0.2531400727475089,
|
|
"learning_rate": 2.9783711427363675e-06,
|
|
"loss": 1.6816,
|
|
"mean_token_accuracy": 0.6506977140903473,
|
|
"num_tokens": 571011048.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 0.8459751264179308,
|
|
"grad_norm": 0.27890951771025624,
|
|
"learning_rate": 2.9748485275468507e-06,
|
|
"loss": 1.6959,
|
|
"mean_token_accuracy": 0.6509178638458252,
|
|
"num_tokens": 571923821.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.8473418067514008,
|
|
"grad_norm": 0.24111894803503628,
|
|
"learning_rate": 2.9713259123573346e-06,
|
|
"loss": 1.669,
|
|
"mean_token_accuracy": 0.6526845633983612,
|
|
"num_tokens": 572801924.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.8487084870848709,
|
|
"grad_norm": 0.23095440029588118,
|
|
"learning_rate": 2.9678032971678177e-06,
|
|
"loss": 1.6417,
|
|
"mean_token_accuracy": 0.656207001209259,
|
|
"num_tokens": 573677718.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 0.8500751674183409,
|
|
"grad_norm": 0.2973193465202493,
|
|
"learning_rate": 2.964280681978301e-06,
|
|
"loss": 1.7054,
|
|
"mean_token_accuracy": 0.6477745831012726,
|
|
"num_tokens": 574630769.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.8514418477518109,
|
|
"grad_norm": 0.24353437909403092,
|
|
"learning_rate": 2.960758066788784e-06,
|
|
"loss": 1.6486,
|
|
"mean_token_accuracy": 0.65456782579422,
|
|
"num_tokens": 575508345.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 0.8528085280852808,
|
|
"grad_norm": 0.3929178050029559,
|
|
"learning_rate": 2.9572354515992675e-06,
|
|
"loss": 1.7154,
|
|
"mean_token_accuracy": 0.6435547113418579,
|
|
"num_tokens": 576396867.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 0.8541752084187508,
|
|
"grad_norm": 0.2386866324522821,
|
|
"learning_rate": 2.9537128364097507e-06,
|
|
"loss": 1.5779,
|
|
"mean_token_accuracy": 0.670587694644928,
|
|
"num_tokens": 577326342.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.8555418887522208,
|
|
"grad_norm": 0.21325560610107128,
|
|
"learning_rate": 2.9501902212202342e-06,
|
|
"loss": 1.6833,
|
|
"mean_token_accuracy": 0.6515741467475891,
|
|
"num_tokens": 578288893.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 0.8569085690856909,
|
|
"grad_norm": 0.3206500915723857,
|
|
"learning_rate": 2.9466676060307178e-06,
|
|
"loss": 1.685,
|
|
"mean_token_accuracy": 0.6499540507793427,
|
|
"num_tokens": 579222531.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 0.8582752494191609,
|
|
"grad_norm": 0.16528506588234054,
|
|
"learning_rate": 2.943144990841201e-06,
|
|
"loss": 1.7028,
|
|
"mean_token_accuracy": 0.6485298037528991,
|
|
"num_tokens": 580163517.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.8596419297526309,
|
|
"grad_norm": 0.2279274660381151,
|
|
"learning_rate": 2.939622375651684e-06,
|
|
"loss": 1.6702,
|
|
"mean_token_accuracy": 0.6565089464187622,
|
|
"num_tokens": 581088969.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.8610086100861009,
|
|
"grad_norm": 0.19536855348148308,
|
|
"learning_rate": 2.936099760462167e-06,
|
|
"loss": 1.6975,
|
|
"mean_token_accuracy": 0.6483010292053223,
|
|
"num_tokens": 582058983.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 0.8623752904195708,
|
|
"grad_norm": 0.21330608355836303,
|
|
"learning_rate": 2.932577145272651e-06,
|
|
"loss": 1.6716,
|
|
"mean_token_accuracy": 0.6550526738166809,
|
|
"num_tokens": 582956560.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 0.8637419707530408,
|
|
"grad_norm": 0.2043518703720773,
|
|
"learning_rate": 2.9290545300831342e-06,
|
|
"loss": 1.6983,
|
|
"mean_token_accuracy": 0.6503245890140533,
|
|
"num_tokens": 583905001.0,
|
|
"step": 6320
|
|
},
|
|
{
"entropy": 1.72578125,
"epoch": 0.8651086510865109,
"grad_norm": 0.3178962010298773,
"learning_rate": 2.9255319148936174e-06,
"loss": 1.7242,
"mean_token_accuracy": 0.6475239217281341,
"num_tokens": 584829314.0,
"step": 6330
},
{
"entropy": 1.58671875,
"epoch": 0.8664753314199809,
"grad_norm": 0.18009294995002,
"learning_rate": 2.9220092997041005e-06,
"loss": 1.5741,
"mean_token_accuracy": 0.6712906062602997,
"num_tokens": 585769008.0,
"step": 6340
},
{
"entropy": 1.68203125,
"epoch": 0.8678420117534509,
"grad_norm": 0.17867487974473473,
"learning_rate": 2.9184866845145836e-06,
"loss": 1.6871,
"mean_token_accuracy": 0.6496051847934723,
"num_tokens": 586690275.0,
"step": 6350
},
{
"entropy": 1.6796875,
"epoch": 0.8692086920869209,
"grad_norm": 0.15150908094593893,
"learning_rate": 2.9149640693250667e-06,
"loss": 1.6894,
"mean_token_accuracy": 0.6507693767547608,
"num_tokens": 587646649.0,
"step": 6360
},
{
"entropy": 1.67578125,
"epoch": 0.8705753724203908,
"grad_norm": 0.22840988880472257,
"learning_rate": 2.9114414541355507e-06,
"loss": 1.6869,
"mean_token_accuracy": 0.6514085710048676,
"num_tokens": 588551294.0,
"step": 6370
},
{
"entropy": 1.69140625,
"epoch": 0.8719420527538608,
"grad_norm": 0.19114619421938844,
"learning_rate": 2.907918838946034e-06,
"loss": 1.6857,
"mean_token_accuracy": 0.6529534220695495,
"num_tokens": 589478263.0,
"step": 6380
},
{
"entropy": 1.68828125,
"epoch": 0.8733087330873309,
"grad_norm": 0.20383605172393285,
"learning_rate": 2.904396223756517e-06,
"loss": 1.6858,
"mean_token_accuracy": 0.6516634464263916,
"num_tokens": 590400298.0,
"step": 6390
},
{
"entropy": 1.721875,
"epoch": 0.8746754134208009,
"grad_norm": 0.24679477566693117,
"learning_rate": 2.9008736085670005e-06,
"loss": 1.7265,
"mean_token_accuracy": 0.6478498458862305,
"num_tokens": 591320358.0,
"step": 6400
},
{
"entropy": 1.6171875,
"epoch": 0.8760420937542709,
"grad_norm": 0.20430890223181553,
"learning_rate": 2.8973509933774836e-06,
"loss": 1.6019,
"mean_token_accuracy": 0.6643312454223633,
"num_tokens": 592215933.0,
"step": 6410
},
{
"entropy": 1.6859375,
"epoch": 0.8774087740877409,
"grad_norm": 0.26562515833596095,
"learning_rate": 2.893828378187967e-06,
"loss": 1.6947,
"mean_token_accuracy": 0.6501117646694183,
"num_tokens": 593158185.0,
"step": 6420
},
{
"entropy": 1.6234375,
"epoch": 0.8787754544212109,
"grad_norm": 0.3913729052430039,
"learning_rate": 2.8903057629984503e-06,
"loss": 1.6282,
"mean_token_accuracy": 0.6589270889759063,
"num_tokens": 594059677.0,
"step": 6430
},
{
"entropy": 1.609375,
"epoch": 0.8801421347546808,
"grad_norm": 0.3245593059590691,
"learning_rate": 2.886783147808934e-06,
"loss": 1.6081,
"mean_token_accuracy": 0.6643651783466339,
"num_tokens": 594999787.0,
"step": 6440
},
{
"entropy": 1.6796875,
"epoch": 0.8815088150881509,
"grad_norm": 0.20436445115132093,
"learning_rate": 2.883260532619417e-06,
"loss": 1.6941,
"mean_token_accuracy": 0.647711044549942,
"num_tokens": 595898216.0,
"step": 6450
},
{
"entropy": 1.68046875,
"epoch": 0.8828754954216209,
"grad_norm": 1.111225107735269,
"learning_rate": 2.8797379174299e-06,
"loss": 1.6594,
"mean_token_accuracy": 0.6552018105983735,
"num_tokens": 596769879.0,
"step": 6460
},
{
"entropy": 1.6390625,
"epoch": 0.8842421757550909,
"grad_norm": 0.2893575955091331,
"learning_rate": 2.876215302240383e-06,
"loss": 1.6454,
"mean_token_accuracy": 0.6615567564964294,
"num_tokens": 597679650.0,
"step": 6470
},
{
"entropy": 1.69921875,
"epoch": 0.8856088560885609,
"grad_norm": 0.5496525063570306,
"learning_rate": 2.872692687050867e-06,
"loss": 1.6969,
"mean_token_accuracy": 0.6494685173034668,
"num_tokens": 598637995.0,
"step": 6480
},
{
"entropy": 1.6984375,
"epoch": 0.8869755364220309,
"grad_norm": 0.2283431278672623,
"learning_rate": 2.8691700718613503e-06,
"loss": 1.7159,
"mean_token_accuracy": 0.6450701713562011,
"num_tokens": 599548259.0,
"step": 6490
},
{
"entropy": 1.65703125,
"epoch": 0.8883422167555008,
"grad_norm": 0.26756889656722166,
"learning_rate": 2.8656474566718334e-06,
"loss": 1.6651,
"mean_token_accuracy": 0.6520837843418121,
"num_tokens": 600502090.0,
"step": 6500
},
{
"entropy": 1.64296875,
"epoch": 0.8897088970889709,
"grad_norm": 0.19196755059062903,
"learning_rate": 2.8621248414823165e-06,
"loss": 1.6465,
"mean_token_accuracy": 0.6570260465145111,
"num_tokens": 601426253.0,
"step": 6510
},
{
"entropy": 1.684375,
"epoch": 0.8910755774224409,
"grad_norm": 0.2587666394027151,
"learning_rate": 2.8586022262927997e-06,
"loss": 1.6705,
"mean_token_accuracy": 0.6526997268199921,
"num_tokens": 602278868.0,
"step": 6520
},
{
"entropy": 1.77890625,
"epoch": 0.8924422577559109,
"grad_norm": 0.20223206670522556,
"learning_rate": 2.8550796111032836e-06,
"loss": 1.8007,
"mean_token_accuracy": 0.6338579773902893,
"num_tokens": 603211640.0,
"step": 6530
},
{
"entropy": 1.66171875,
"epoch": 0.8938089380893809,
"grad_norm": 0.23065492082889438,
"learning_rate": 2.8515569959137668e-06,
"loss": 1.6919,
"mean_token_accuracy": 0.6495375871658325,
"num_tokens": 604125871.0,
"step": 6540
},
{
"entropy": 1.6625,
"epoch": 0.8951756184228509,
"grad_norm": 0.3520471246612006,
"learning_rate": 2.84803438072425e-06,
"loss": 1.673,
"mean_token_accuracy": 0.6565649569034576,
"num_tokens": 605043621.0,
"step": 6550
},
{
"entropy": 1.6515625,
"epoch": 0.8965422987563209,
"grad_norm": 0.166457878258033,
"learning_rate": 2.844511765534733e-06,
"loss": 1.6623,
"mean_token_accuracy": 0.6549607157707215,
"num_tokens": 605953104.0,
"step": 6560
},
{
"entropy": 1.628125,
"epoch": 0.897908979089791,
"grad_norm": 0.2781821302609559,
"learning_rate": 2.8409891503452166e-06,
"loss": 1.6231,
"mean_token_accuracy": 0.6615191996097565,
"num_tokens": 606893771.0,
"step": 6570
},
{
"entropy": 1.6625,
"epoch": 0.8992756594232609,
"grad_norm": 0.2501829628601253,
"learning_rate": 2.8374665351556997e-06,
"loss": 1.6711,
"mean_token_accuracy": 0.652370285987854,
"num_tokens": 607790815.0,
"step": 6580
},
{
"entropy": 1.65546875,
"epoch": 0.9006423397567309,
"grad_norm": 0.22590039211303614,
"learning_rate": 2.8339439199661832e-06,
"loss": 1.6506,
"mean_token_accuracy": 0.6577490925788879,
"num_tokens": 608759014.0,
"step": 6590
},
{
"entropy": 1.69765625,
"epoch": 0.9020090200902009,
"grad_norm": 0.23421699544379979,
"learning_rate": 2.8304213047766663e-06,
"loss": 1.7118,
"mean_token_accuracy": 0.6460451543331146,
"num_tokens": 609649798.0,
"step": 6600
},
{
"entropy": 1.615625,
"epoch": 0.9033757004236709,
"grad_norm": 0.2609741391972759,
"learning_rate": 2.82689868958715e-06,
"loss": 1.6305,
"mean_token_accuracy": 0.6601125955581665,
"num_tokens": 610585055.0,
"step": 6610
},
{
"entropy": 1.66484375,
"epoch": 0.9047423807571409,
"grad_norm": 0.19186431460742098,
"learning_rate": 2.823376074397633e-06,
"loss": 1.662,
"mean_token_accuracy": 0.653764146566391,
"num_tokens": 611507199.0,
"step": 6620
},
{
"entropy": 1.63359375,
"epoch": 0.906109061090611,
"grad_norm": 0.3443007414592466,
"learning_rate": 2.819853459208116e-06,
"loss": 1.629,
"mean_token_accuracy": 0.6594812929630279,
"num_tokens": 612393780.0,
"step": 6630
},
{
"entropy": 1.71171875,
"epoch": 0.9074757414240809,
"grad_norm": 0.19824149285817347,
"learning_rate": 2.8163308440186e-06,
"loss": 1.7354,
"mean_token_accuracy": 0.6426892161369324,
"num_tokens": 613315091.0,
"step": 6640
},
{
"entropy": 1.646875,
"epoch": 0.9088424217575509,
"grad_norm": 0.22978314227042687,
"learning_rate": 2.8128082288290832e-06,
"loss": 1.6523,
"mean_token_accuracy": 0.6571177780628205,
"num_tokens": 614223431.0,
"step": 6650
},
{
"entropy": 1.72109375,
"epoch": 0.9102091020910209,
"grad_norm": 0.43877329074444105,
"learning_rate": 2.8092856136395664e-06,
"loss": 1.7143,
"mean_token_accuracy": 0.6457405745983124,
"num_tokens": 615153232.0,
"step": 6660
},
{
"entropy": 1.68515625,
"epoch": 0.9115757824244909,
"grad_norm": 0.3175512898214677,
"learning_rate": 2.8057629984500495e-06,
"loss": 1.6996,
"mean_token_accuracy": 0.6440665006637574,
"num_tokens": 616121094.0,
"step": 6670
},
{
"entropy": 1.66953125,
"epoch": 0.9129424627579609,
"grad_norm": 0.3061243929923493,
"learning_rate": 2.8022403832605326e-06,
"loss": 1.6697,
"mean_token_accuracy": 0.6550917327404022,
"num_tokens": 617060291.0,
"step": 6680
},
{
"entropy": 1.6625,
"epoch": 0.914309143091431,
"grad_norm": 0.1816424488333513,
"learning_rate": 2.7987177680710157e-06,
"loss": 1.6779,
"mean_token_accuracy": 0.6551183044910431,
"num_tokens": 618006136.0,
"step": 6690
},
{
"entropy": 1.6921875,
"epoch": 0.915675823424901,
"grad_norm": 0.5220318480709423,
"learning_rate": 2.7951951528814997e-06,
"loss": 1.7005,
"mean_token_accuracy": 0.6488466441631318,
"num_tokens": 618954664.0,
"step": 6700
},
{
"entropy": 1.6703125,
"epoch": 0.9170425037583709,
"grad_norm": 0.23662719481371774,
"learning_rate": 2.791672537691983e-06,
"loss": 1.6962,
"mean_token_accuracy": 0.6458711206912995,
"num_tokens": 619928446.0,
"step": 6710
},
{
"entropy": 1.646875,
"epoch": 0.9184091840918409,
"grad_norm": 0.2515629802108362,
"learning_rate": 2.788149922502466e-06,
"loss": 1.6488,
"mean_token_accuracy": 0.6590460360050201,
"num_tokens": 620837291.0,
"step": 6720
},
{
"entropy": 1.62578125,
"epoch": 0.9197758644253109,
"grad_norm": 0.2657828376056659,
"learning_rate": 2.784627307312949e-06,
"loss": 1.6296,
"mean_token_accuracy": 0.6619285464286804,
"num_tokens": 621786096.0,
"step": 6730
},
{
"entropy": 1.63671875,
"epoch": 0.9211425447587809,
"grad_norm": 0.184504069296063,
"learning_rate": 2.7811046921234326e-06,
"loss": 1.6288,
"mean_token_accuracy": 0.6591895937919616,
"num_tokens": 622713372.0,
"step": 6740
},
{
"entropy": 1.6640625,
"epoch": 0.922509225092251,
"grad_norm": 0.1843536466883729,
"learning_rate": 2.777582076933916e-06,
"loss": 1.67,
"mean_token_accuracy": 0.6536752700805664,
"num_tokens": 623619668.0,
"step": 6750
},
{
"entropy": 1.68359375,
"epoch": 0.923875905425721,
"grad_norm": 0.2588972387899062,
"learning_rate": 2.7740594617443993e-06,
"loss": 1.6823,
"mean_token_accuracy": 0.6512921988964081,
"num_tokens": 624570491.0,
"step": 6760
},
{
"entropy": 1.63671875,
"epoch": 0.9252425857591909,
"grad_norm": 0.2574371049471701,
"learning_rate": 2.770536846554883e-06,
"loss": 1.652,
"mean_token_accuracy": 0.6576015651226044,
"num_tokens": 625544212.0,
"step": 6770
},
{
"entropy": 1.68125,
"epoch": 0.9266092660926609,
"grad_norm": 0.3570109746700755,
"learning_rate": 2.767014231365366e-06,
"loss": 1.689,
"mean_token_accuracy": 0.6500046253204346,
"num_tokens": 626460522.0,
"step": 6780
},
{
"entropy": 1.6984375,
"epoch": 0.9279759464261309,
"grad_norm": 0.17929326679654492,
"learning_rate": 2.763491616175849e-06,
"loss": 1.6976,
"mean_token_accuracy": 0.6503907024860383,
"num_tokens": 627430843.0,
"step": 6790
},
{
"entropy": 1.66171875,
"epoch": 0.9293426267596009,
"grad_norm": 0.17034158404946026,
"learning_rate": 2.759969000986332e-06,
"loss": 1.6599,
"mean_token_accuracy": 0.6542510628700257,
"num_tokens": 628373544.0,
"step": 6800
},
{
"entropy": 1.61640625,
"epoch": 0.930709307093071,
"grad_norm": 0.23826013967417684,
"learning_rate": 2.756446385796816e-06,
"loss": 1.6258,
"mean_token_accuracy": 0.6607022106647491,
"num_tokens": 629282709.0,
"step": 6810
},
{
"entropy": 1.646875,
"epoch": 0.932075987426541,
"grad_norm": 0.21697423383407105,
"learning_rate": 2.7529237706072993e-06,
"loss": 1.6475,
"mean_token_accuracy": 0.6556715905666352,
"num_tokens": 630222309.0,
"step": 6820
},
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 0.933442667760011,
|
|
"grad_norm": 0.20518680601284633,
|
|
"learning_rate": 2.7494011554177824e-06,
|
|
"loss": 1.6147,
|
|
"mean_token_accuracy": 0.664126843214035,
|
|
"num_tokens": 631143499.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 1.71328125,
|
|
"epoch": 0.9348093480934809,
|
|
"grad_norm": 0.28269389266406725,
|
|
"learning_rate": 2.7458785402282656e-06,
|
|
"loss": 1.7231,
|
|
"mean_token_accuracy": 0.6456651270389557,
|
|
"num_tokens": 632059376.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 0.9361760284269509,
|
|
"grad_norm": 0.1768191715435236,
|
|
"learning_rate": 2.7423559250387487e-06,
|
|
"loss": 1.7158,
|
|
"mean_token_accuracy": 0.6457975745201111,
|
|
"num_tokens": 632985347.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 0.9375427087604209,
|
|
"grad_norm": 0.22616384545525103,
|
|
"learning_rate": 2.7388333098492326e-06,
|
|
"loss": 1.6212,
|
|
"mean_token_accuracy": 0.6622761607170105,
|
|
"num_tokens": 633856793.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 0.938909389093891,
|
|
"grad_norm": 0.18203816080216537,
|
|
"learning_rate": 2.7353106946597158e-06,
|
|
"loss": 1.6843,
|
|
"mean_token_accuracy": 0.6501191020011902,
|
|
"num_tokens": 634824790.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 0.940276069427361,
|
|
"grad_norm": 0.20839409174658605,
|
|
"learning_rate": 2.731788079470199e-06,
|
|
"loss": 1.6879,
|
|
"mean_token_accuracy": 0.6484551906585694,
|
|
"num_tokens": 635705426.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 0.941642749760831,
|
|
"grad_norm": 0.16202741599521514,
|
|
"learning_rate": 2.728265464280682e-06,
|
|
"loss": 1.7012,
|
|
"mean_token_accuracy": 0.6492110908031463,
|
|
"num_tokens": 636605186.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 0.9430094300943009,
|
|
"grad_norm": 0.17978707650451867,
|
|
"learning_rate": 2.7247428490911656e-06,
|
|
"loss": 1.6626,
|
|
"mean_token_accuracy": 0.6545498311519623,
|
|
"num_tokens": 637491028.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 0.9443761104277709,
|
|
"grad_norm": 0.22161703866855942,
|
|
"learning_rate": 2.7212202339016487e-06,
|
|
"loss": 1.6785,
|
|
"mean_token_accuracy": 0.6521097004413605,
|
|
"num_tokens": 638429703.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.9457427907612409,
|
|
"grad_norm": 0.2673077417072573,
|
|
"learning_rate": 2.7176976187121322e-06,
|
|
"loss": 1.6575,
|
|
"mean_token_accuracy": 0.656025105714798,
|
|
"num_tokens": 639419723.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 0.947109471094711,
|
|
"grad_norm": 0.4351567623541186,
|
|
"learning_rate": 2.7141750035226154e-06,
|
|
"loss": 1.6828,
|
|
"mean_token_accuracy": 0.6508717894554138,
|
|
"num_tokens": 640386676.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 0.948476151428181,
|
|
"grad_norm": 0.38099331027364897,
|
|
"learning_rate": 2.710652388333099e-06,
|
|
"loss": 1.6736,
|
|
"mean_token_accuracy": 0.6549591422080994,
|
|
"num_tokens": 641343196.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 0.949842831761651,
|
|
"grad_norm": 0.27406704812082733,
|
|
"learning_rate": 2.707129773143582e-06,
|
|
"loss": 1.7277,
|
|
"mean_token_accuracy": 0.6429347097873688,
|
|
"num_tokens": 642301044.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 1.65,
|
|
"epoch": 0.951209512095121,
|
|
"grad_norm": 0.25213548331229,
|
|
"learning_rate": 2.703607157954065e-06,
|
|
"loss": 1.6543,
|
|
"mean_token_accuracy": 0.6556038618087768,
|
|
"num_tokens": 643236365.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 0.9525761924285909,
|
|
"grad_norm": 0.17360309604613172,
|
|
"learning_rate": 2.7000845427645483e-06,
|
|
"loss": 1.698,
|
|
"mean_token_accuracy": 0.6481959819793701,
|
|
"num_tokens": 644125291.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 0.9539428727620609,
|
|
"grad_norm": 0.2058665199315186,
|
|
"learning_rate": 2.6965619275750322e-06,
|
|
"loss": 1.5947,
|
|
"mean_token_accuracy": 0.6636456370353698,
|
|
"num_tokens": 645029995.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 1.7546875,
|
|
"epoch": 0.955309553095531,
|
|
"grad_norm": 0.3521624890366951,
|
|
"learning_rate": 2.6930393123855154e-06,
|
|
"loss": 1.7786,
|
|
"mean_token_accuracy": 0.6389911293983459,
|
|
"num_tokens": 645970980.0,
|
|
"step": 6990
|
|
},
|
|
{
"entropy": 1.7265625,
"epoch": 0.956676233429001,
"grad_norm": 0.23875199007120596,
"learning_rate": 2.6895166971959985e-06,
"loss": 1.7493,
"mean_token_accuracy": 0.6418570280075073,
"num_tokens": 646871402.0,
"step": 7000
},
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.958042913762471,
|
|
"grad_norm": 0.18391255666505105,
|
|
"learning_rate": 2.6859940820064816e-06,
|
|
"loss": 1.6283,
|
|
"mean_token_accuracy": 0.6613797962665557,
|
|
"num_tokens": 647778696.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.959409594095941,
|
|
"grad_norm": 0.20316009592424789,
|
|
"learning_rate": 2.6824714668169647e-06,
|
|
"loss": 1.6644,
|
|
"mean_token_accuracy": 0.6538566768169403,
|
|
"num_tokens": 648694055.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 0.9607762744294109,
|
|
"grad_norm": 0.3081937518458796,
|
|
"learning_rate": 2.6789488516274487e-06,
|
|
"loss": 1.7051,
|
|
"mean_token_accuracy": 0.6477914988994599,
|
|
"num_tokens": 649689482.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 0.9621429547628809,
|
|
"grad_norm": 0.2412672279785936,
|
|
"learning_rate": 2.675426236437932e-06,
|
|
"loss": 1.6664,
|
|
"mean_token_accuracy": 0.6523195147514343,
|
|
"num_tokens": 650654616.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 0.963509635096351,
|
|
"grad_norm": 0.2993353774839712,
|
|
"learning_rate": 2.671903621248415e-06,
|
|
"loss": 1.6449,
|
|
"mean_token_accuracy": 0.6545247375965119,
|
|
"num_tokens": 651547118.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.964876315429821,
|
|
"grad_norm": 0.3040355022581922,
|
|
"learning_rate": 2.668381006058898e-06,
|
|
"loss": 1.6786,
|
|
"mean_token_accuracy": 0.6514400660991668,
|
|
"num_tokens": 652444577.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 0.966242995763291,
|
|
"grad_norm": 0.3044498329196681,
|
|
"learning_rate": 2.6648583908693816e-06,
|
|
"loss": 1.6977,
|
|
"mean_token_accuracy": 0.6493261396884918,
|
|
"num_tokens": 653366870.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 0.967609676096761,
|
|
"grad_norm": 0.1925046568950103,
|
|
"learning_rate": 2.6613357756798648e-06,
|
|
"loss": 1.7393,
|
|
"mean_token_accuracy": 0.6442283689975739,
|
|
"num_tokens": 654286956.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 1.6453125,
|
|
"epoch": 0.968976356430231,
|
|
"grad_norm": 0.217696984693649,
|
|
"learning_rate": 2.6578131604903483e-06,
|
|
"loss": 1.6559,
|
|
"mean_token_accuracy": 0.6557366847991943,
|
|
"num_tokens": 655174139.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 0.9703430367637009,
|
|
"grad_norm": 0.4120650821488678,
|
|
"learning_rate": 2.6542905453008314e-06,
|
|
"loss": 1.6685,
|
|
"mean_token_accuracy": 0.6537732362747193,
|
|
"num_tokens": 656097004.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 1.696875,
|
|
"epoch": 0.971709717097171,
|
|
"grad_norm": 0.23382138588382592,
|
|
"learning_rate": 2.650767930111315e-06,
|
|
"loss": 1.685,
|
|
"mean_token_accuracy": 0.6533159732818603,
|
|
"num_tokens": 657029359.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 0.973076397430641,
|
|
"grad_norm": 0.19272535237379368,
|
|
"learning_rate": 2.647245314921798e-06,
|
|
"loss": 1.6945,
|
|
"mean_token_accuracy": 0.6505637109279633,
|
|
"num_tokens": 657989480.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 0.974443077764111,
|
|
"grad_norm": 0.19605939860927624,
|
|
"learning_rate": 2.6437226997322812e-06,
|
|
"loss": 1.6743,
|
|
"mean_token_accuracy": 0.6528466820716858,
|
|
"num_tokens": 658962059.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 0.975809758097581,
|
|
"grad_norm": 0.22171062627726018,
|
|
"learning_rate": 2.640200084542765e-06,
|
|
"loss": 1.7008,
|
|
"mean_token_accuracy": 0.6480770289897919,
|
|
"num_tokens": 659898843.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 0.977176438431051,
|
|
"grad_norm": 0.2220034311847891,
|
|
"learning_rate": 2.6366774693532483e-06,
|
|
"loss": 1.6576,
|
|
"mean_token_accuracy": 0.6555830240249634,
|
|
"num_tokens": 660760246.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 0.9785431187645209,
|
|
"grad_norm": 0.30984332031862144,
|
|
"learning_rate": 2.6331548541637314e-06,
|
|
"loss": 1.7331,
|
|
"mean_token_accuracy": 0.6434890925884247,
|
|
"num_tokens": 661678111.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 0.979909799097991,
|
|
"grad_norm": 0.38262661438217765,
|
|
"learning_rate": 2.6296322389742146e-06,
|
|
"loss": 1.6336,
|
|
"mean_token_accuracy": 0.6589345097541809,
|
|
"num_tokens": 662613458.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 0.981276479431461,
|
|
"grad_norm": 0.2761293000913367,
|
|
"learning_rate": 2.6261096237846977e-06,
|
|
"loss": 1.6201,
|
|
"mean_token_accuracy": 0.6623457074165344,
|
|
"num_tokens": 663524576.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 0.982643159764931,
|
|
"grad_norm": 0.19780728773964268,
|
|
"learning_rate": 2.622587008595181e-06,
|
|
"loss": 1.6401,
|
|
"mean_token_accuracy": 0.6570155620574951,
|
|
"num_tokens": 664426384.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 1.703125,
|
|
"epoch": 0.984009840098401,
|
|
"grad_norm": 0.29573491069114366,
|
|
"learning_rate": 2.6190643934056648e-06,
|
|
"loss": 1.7108,
|
|
"mean_token_accuracy": 0.6461049258708954,
|
|
"num_tokens": 665358458.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 1.6984375,
|
|
"epoch": 0.985376520431871,
|
|
"grad_norm": 0.22943269082715864,
|
|
"learning_rate": 2.615541778216148e-06,
|
|
"loss": 1.7119,
|
|
"mean_token_accuracy": 0.6465909898281097,
|
|
"num_tokens": 666261171.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 0.9867432007653409,
|
|
"grad_norm": 0.29451852317574106,
|
|
"learning_rate": 2.612019163026631e-06,
|
|
"loss": 1.6699,
|
|
"mean_token_accuracy": 0.6548290312290191,
|
|
"num_tokens": 667166522.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 1.71484375,
|
|
"epoch": 0.988109881098811,
|
|
"grad_norm": 0.23046675611933445,
|
|
"learning_rate": 2.6084965478371146e-06,
|
|
"loss": 1.7344,
|
|
"mean_token_accuracy": 0.6430014848709107,
|
|
"num_tokens": 668094023.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 0.989476561432281,
|
|
"grad_norm": 0.18076403553922385,
|
|
"learning_rate": 2.6049739326475977e-06,
|
|
"loss": 1.6177,
|
|
"mean_token_accuracy": 0.6608222663402558,
|
|
"num_tokens": 669024537.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 1.7109375,
|
|
"epoch": 0.990843241765751,
|
|
"grad_norm": 1.1237875998431341,
|
|
"learning_rate": 2.6014513174580812e-06,
|
|
"loss": 1.7148,
|
|
"mean_token_accuracy": 0.6459267556667327,
|
|
"num_tokens": 669920244.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 0.992209922099221,
|
|
"grad_norm": 0.18897893095893528,
|
|
"learning_rate": 2.5979287022685644e-06,
|
|
"loss": 1.6335,
|
|
"mean_token_accuracy": 0.6597852528095245,
|
|
"num_tokens": 670821760.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 0.993576602432691,
|
|
"grad_norm": 0.27260618026214073,
|
|
"learning_rate": 2.594406087079048e-06,
|
|
"loss": 1.6065,
|
|
"mean_token_accuracy": 0.6618834555149078,
|
|
"num_tokens": 671732949.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 1.7125,
|
|
"epoch": 0.994943282766161,
|
|
"grad_norm": 0.5104697516456056,
|
|
"learning_rate": 2.590883471889531e-06,
|
|
"loss": 1.7251,
|
|
"mean_token_accuracy": 0.644684088230133,
|
|
"num_tokens": 672642770.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 0.996309963099631,
|
|
"grad_norm": 0.4439888456980909,
|
|
"learning_rate": 2.587360856700014e-06,
|
|
"loss": 1.6303,
|
|
"mean_token_accuracy": 0.6582875549793243,
|
|
"num_tokens": 673545630.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 0.997676643433101,
|
|
"grad_norm": 0.29063700307444046,
|
|
"learning_rate": 2.5838382415104973e-06,
|
|
"loss": 1.6668,
|
|
"mean_token_accuracy": 0.6535208940505981,
|
|
"num_tokens": 674462480.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 0.999043323766571,
|
|
"grad_norm": 0.2526617863534266,
|
|
"learning_rate": 2.5803156263209813e-06,
|
|
"loss": 1.6924,
|
|
"mean_token_accuracy": 0.6464374542236329,
|
|
"num_tokens": 675361911.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 1.7265625,
|
|
"epoch": 1.000410004100041,
|
|
"grad_norm": 0.1965569246375378,
|
|
"learning_rate": 2.5767930111314644e-06,
|
|
"loss": 1.732,
|
|
"mean_token_accuracy": 0.6419793248176575,
|
|
"num_tokens": 676316566.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.001776684433511,
|
|
"grad_norm": 0.35596841435570603,
|
|
"learning_rate": 2.5732703959419475e-06,
|
|
"loss": 1.6452,
|
|
"mean_token_accuracy": 0.6560405671596528,
|
|
"num_tokens": 677218311.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 1.003143364766981,
|
|
"grad_norm": 0.20141804080844047,
|
|
"learning_rate": 2.5697477807524306e-06,
|
|
"loss": 1.6863,
|
|
"mean_token_accuracy": 0.6485451638698578,
|
|
"num_tokens": 678197771.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.004510045100451,
|
|
"grad_norm": 0.1691285433634823,
|
|
"learning_rate": 2.5662251655629138e-06,
|
|
"loss": 1.6239,
|
|
"mean_token_accuracy": 0.660239064693451,
|
|
"num_tokens": 679125795.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 1.73203125,
|
|
"epoch": 1.005876725433921,
|
|
"grad_norm": 0.23463217801492875,
|
|
"learning_rate": 2.5627025503733977e-06,
|
|
"loss": 1.7444,
|
|
"mean_token_accuracy": 0.6421261072158814,
|
|
"num_tokens": 680052452.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.007243405767391,
|
|
"grad_norm": 0.18403119799809067,
|
|
"learning_rate": 2.559179935183881e-06,
|
|
"loss": 1.6862,
|
|
"mean_token_accuracy": 0.6517752051353455,
|
|
"num_tokens": 680960579.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 1.7203125,
|
|
"epoch": 1.0086100861008611,
|
|
"grad_norm": 0.22738921295724818,
|
|
"learning_rate": 2.555657319994364e-06,
|
|
"loss": 1.7381,
|
|
"mean_token_accuracy": 0.6462263464927673,
|
|
"num_tokens": 681880872.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 1.684375,
|
|
"epoch": 1.009976766434331,
|
|
"grad_norm": 0.2960086146067728,
|
|
"learning_rate": 2.552134704804847e-06,
|
|
"loss": 1.6967,
|
|
"mean_token_accuracy": 0.6460098385810852,
|
|
"num_tokens": 682835224.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 1.740625,
|
|
"epoch": 1.011343446767801,
|
|
"grad_norm": 0.22758261878202177,
|
|
"learning_rate": 2.5486120896153306e-06,
|
|
"loss": 1.768,
|
|
"mean_token_accuracy": 0.6361886382102966,
|
|
"num_tokens": 683737488.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.012710127101271,
|
|
"grad_norm": 0.2412313331532355,
|
|
"learning_rate": 2.5450894744258138e-06,
|
|
"loss": 1.6652,
|
|
"mean_token_accuracy": 0.6545486807823181,
|
|
"num_tokens": 684660891.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.014076807434741,
|
|
"grad_norm": 0.21609302003336925,
|
|
"learning_rate": 2.5415668592362973e-06,
|
|
"loss": 1.6288,
|
|
"mean_token_accuracy": 0.6614020884037017,
|
|
"num_tokens": 685544792.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.015443487768211,
|
|
"grad_norm": 0.17405968131328678,
|
|
"learning_rate": 2.5380442440467804e-06,
|
|
"loss": 1.6425,
|
|
"mean_token_accuracy": 0.6609974086284638,
|
|
"num_tokens": 686490534.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 1.016810168101681,
|
|
"grad_norm": 0.22338974299455241,
|
|
"learning_rate": 2.534521628857264e-06,
|
|
"loss": 1.6951,
|
|
"mean_token_accuracy": 0.6482141554355622,
|
|
"num_tokens": 687406392.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.018176848435151,
|
|
"grad_norm": 0.22218641740941839,
|
|
"learning_rate": 2.530999013667747e-06,
|
|
"loss": 1.6535,
|
|
"mean_token_accuracy": 0.6567085564136506,
|
|
"num_tokens": 688305418.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 1.78125,
|
|
"epoch": 1.019543528768621,
|
|
"grad_norm": 0.17192502981861707,
|
|
"learning_rate": 2.5274763984782302e-06,
|
|
"loss": 1.7848,
|
|
"mean_token_accuracy": 0.6379982829093933,
|
|
"num_tokens": 689231036.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.020910209102091,
|
|
"grad_norm": 0.2912463586466522,
|
|
"learning_rate": 2.5239537832887138e-06,
|
|
"loss": 1.635,
|
|
"mean_token_accuracy": 0.6597213029861451,
|
|
"num_tokens": 690189490.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 1.022276889435561,
|
|
"grad_norm": 0.28276016663929016,
|
|
"learning_rate": 2.5204311680991973e-06,
|
|
"loss": 1.6724,
|
|
"mean_token_accuracy": 0.6505327343940734,
|
|
"num_tokens": 691119666.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.023643569769031,
|
|
"grad_norm": 0.269462640799337,
|
|
"learning_rate": 2.5169085529096804e-06,
|
|
"loss": 1.6138,
|
|
"mean_token_accuracy": 0.6623372316360474,
|
|
"num_tokens": 691976922.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 1.71484375,
|
|
"epoch": 1.0250102501025011,
|
|
"grad_norm": 0.2597246878519548,
|
|
"learning_rate": 2.5133859377201636e-06,
|
|
"loss": 1.7274,
|
|
"mean_token_accuracy": 0.6404986441135406,
|
|
"num_tokens": 692888020.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 1.721875,
|
|
"epoch": 1.0263769304359711,
|
|
"grad_norm": 0.21955954267310826,
|
|
"learning_rate": 2.5098633225306467e-06,
|
|
"loss": 1.7279,
|
|
"mean_token_accuracy": 0.6438687086105347,
|
|
"num_tokens": 693820737.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.027743610769441,
|
|
"grad_norm": 0.21984231174833332,
|
|
"learning_rate": 2.50634070734113e-06,
|
|
"loss": 1.6538,
|
|
"mean_token_accuracy": 0.6533161461353302,
|
|
"num_tokens": 694756007.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.029110291102911,
|
|
"grad_norm": 0.23868062914946833,
|
|
"learning_rate": 2.502818092151614e-06,
|
|
"loss": 1.6762,
|
|
"mean_token_accuracy": 0.6532954335212707,
|
|
"num_tokens": 695661989.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 1.030476971436381,
|
|
"grad_norm": 0.23543679146573745,
|
|
"learning_rate": 2.499295476962097e-06,
|
|
"loss": 1.6619,
|
|
"mean_token_accuracy": 0.6535726547241211,
|
|
"num_tokens": 696589885.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 1.031843651769851,
|
|
"grad_norm": 0.3369639452739896,
|
|
"learning_rate": 2.49577286177258e-06,
|
|
"loss": 1.6974,
|
|
"mean_token_accuracy": 0.6523191213607789,
|
|
"num_tokens": 697543993.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 1.6921875,
|
|
"epoch": 1.033210332103321,
|
|
"grad_norm": 1.4148632582695682,
|
|
"learning_rate": 2.492250246583063e-06,
|
|
"loss": 1.7082,
|
|
"mean_token_accuracy": 0.6484336793422699,
|
|
"num_tokens": 698454919.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.034577012436791,
|
|
"grad_norm": 0.2082838698170078,
|
|
"learning_rate": 2.4887276313935467e-06,
|
|
"loss": 1.6634,
|
|
"mean_token_accuracy": 0.655291622877121,
|
|
"num_tokens": 699361699.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 1.035943692770261,
|
|
"grad_norm": 0.2514044460557142,
|
|
"learning_rate": 2.48520501620403e-06,
|
|
"loss": 1.6586,
|
|
"mean_token_accuracy": 0.653133875131607,
|
|
"num_tokens": 700300161.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 1.037310373103731,
|
|
"grad_norm": 0.3181356580291977,
|
|
"learning_rate": 2.4816824010145134e-06,
|
|
"loss": 1.6924,
|
|
"mean_token_accuracy": 0.6504492163658142,
|
|
"num_tokens": 701203486.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.038677053437201,
|
|
"grad_norm": 0.1946638754689997,
|
|
"learning_rate": 2.478159785824997e-06,
|
|
"loss": 1.6703,
|
|
"mean_token_accuracy": 0.6520937800407409,
|
|
"num_tokens": 702184729.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 1.040043733770671,
|
|
"grad_norm": 0.27086311595405127,
|
|
"learning_rate": 2.47463717063548e-06,
|
|
"loss": 1.6774,
|
|
"mean_token_accuracy": 0.6525812208652496,
|
|
"num_tokens": 703115405.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 1.0414104141041411,
|
|
"grad_norm": 0.42838974966856974,
|
|
"learning_rate": 2.4711145554459636e-06,
|
|
"loss": 1.7466,
|
|
"mean_token_accuracy": 0.6410577654838562,
|
|
"num_tokens": 704066947.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 1.72265625,
|
|
"epoch": 1.0427770944376111,
|
|
"grad_norm": 0.22043381719645924,
|
|
"learning_rate": 2.4675919402564467e-06,
|
|
"loss": 1.7196,
|
|
"mean_token_accuracy": 0.6461298108100891,
|
|
"num_tokens": 704986582.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 1.0441437747710811,
|
|
"grad_norm": 0.22321117106578803,
|
|
"learning_rate": 2.46406932506693e-06,
|
|
"loss": 1.687,
|
|
"mean_token_accuracy": 0.6498645365238189,
|
|
"num_tokens": 705943054.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 1.045510455104551,
|
|
"grad_norm": 0.20876410436198015,
|
|
"learning_rate": 2.4605467098774134e-06,
|
|
"loss": 1.7102,
|
|
"mean_token_accuracy": 0.6474853157997131,
|
|
"num_tokens": 706905798.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 1.046877135438021,
|
|
"grad_norm": 0.17731888714348779,
|
|
"learning_rate": 2.4570240946878965e-06,
|
|
"loss": 1.6666,
|
|
"mean_token_accuracy": 0.6538927733898163,
|
|
"num_tokens": 707820496.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.048243815771491,
|
|
"grad_norm": 0.2161538130252074,
|
|
"learning_rate": 2.4535014794983796e-06,
|
|
"loss": 1.689,
|
|
"mean_token_accuracy": 0.6518913984298706,
|
|
"num_tokens": 708741330.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.049610496104961,
|
|
"grad_norm": 0.19824616572166354,
|
|
"learning_rate": 2.449978864308863e-06,
|
|
"loss": 1.6231,
|
|
"mean_token_accuracy": 0.6603968918323517,
|
|
"num_tokens": 709623662.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 1.050977176438431,
|
|
"grad_norm": 0.1917124976069378,
|
|
"learning_rate": 2.4464562491193463e-06,
|
|
"loss": 1.6628,
|
|
"mean_token_accuracy": 0.6550075709819794,
|
|
"num_tokens": 710513988.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 1.052343856771901,
|
|
"grad_norm": 0.21680044893662154,
|
|
"learning_rate": 2.4429336339298294e-06,
|
|
"loss": 1.7157,
|
|
"mean_token_accuracy": 0.6478516221046448,
|
|
"num_tokens": 711462124.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.053710537105371,
|
|
"grad_norm": 0.20085520229134174,
|
|
"learning_rate": 2.439411018740313e-06,
|
|
"loss": 1.5915,
|
|
"mean_token_accuracy": 0.6619880139827728,
|
|
"num_tokens": 712342813.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.055077217438841,
|
|
"grad_norm": 0.2764126041873593,
|
|
"learning_rate": 2.435888403550796e-06,
|
|
"loss": 1.659,
|
|
"mean_token_accuracy": 0.654501485824585,
|
|
"num_tokens": 713243182.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 1.056443897772311,
|
|
"grad_norm": 0.2218272333558382,
|
|
"learning_rate": 2.4323657883612797e-06,
|
|
"loss": 1.6821,
|
|
"mean_token_accuracy": 0.6511309921741486,
|
|
"num_tokens": 714172123.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 1.0578105781057812,
|
|
"grad_norm": 0.2469322423206653,
|
|
"learning_rate": 2.4288431731717628e-06,
|
|
"loss": 1.6796,
|
|
"mean_token_accuracy": 0.6510632514953614,
|
|
"num_tokens": 715059084.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 1.0591772584392511,
|
|
"grad_norm": 0.3332803587488752,
|
|
"learning_rate": 2.4253205579822463e-06,
|
|
"loss": 1.686,
|
|
"mean_token_accuracy": 0.6501067399978637,
|
|
"num_tokens": 715954395.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.0605439387727211,
|
|
"grad_norm": 0.2389270673715273,
|
|
"learning_rate": 2.4217979427927294e-06,
|
|
"loss": 1.6088,
|
|
"mean_token_accuracy": 0.6623116135597229,
|
|
"num_tokens": 716853934.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.0619106191061911,
|
|
"grad_norm": 0.20443418308694333,
|
|
"learning_rate": 2.418275327603213e-06,
|
|
"loss": 1.6112,
|
|
"mean_token_accuracy": 0.6614984273910522,
|
|
"num_tokens": 717756851.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.063277299439661,
|
|
"grad_norm": 0.3060930650237438,
|
|
"learning_rate": 2.414752712413696e-06,
|
|
"loss": 1.6462,
|
|
"mean_token_accuracy": 0.6573056221008301,
|
|
"num_tokens": 718678398.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.064643979773131,
|
|
"grad_norm": 0.16861485470718213,
|
|
"learning_rate": 2.4112300972241797e-06,
|
|
"loss": 1.5859,
|
|
"mean_token_accuracy": 0.6666079938411713,
|
|
"num_tokens": 719593304.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 1.066010660106601,
|
|
"grad_norm": 0.3196424718613158,
|
|
"learning_rate": 2.4077074820346628e-06,
|
|
"loss": 1.6838,
|
|
"mean_token_accuracy": 0.6512435138225555,
|
|
"num_tokens": 720518644.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 1.7125,
|
|
"epoch": 1.067377340440071,
|
|
"grad_norm": 0.18841932480384777,
|
|
"learning_rate": 2.404184866845146e-06,
|
|
"loss": 1.7165,
|
|
"mean_token_accuracy": 0.6448841989040375,
|
|
"num_tokens": 721466173.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 1.068744020773541,
|
|
"grad_norm": 0.22920383234557604,
|
|
"learning_rate": 2.4006622516556295e-06,
|
|
"loss": 1.6717,
|
|
"mean_token_accuracy": 0.650806188583374,
|
|
"num_tokens": 722391378.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 1.71171875,
|
|
"epoch": 1.070110701107011,
|
|
"grad_norm": 0.168284673843641,
|
|
"learning_rate": 2.3971396364661126e-06,
|
|
"loss": 1.7192,
|
|
"mean_token_accuracy": 0.6480787336826325,
|
|
"num_tokens": 723301043.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 1.071477381440481,
|
|
"grad_norm": 0.22840219011697335,
|
|
"learning_rate": 2.393617021276596e-06,
|
|
"loss": 1.6877,
|
|
"mean_token_accuracy": 0.6502880871295929,
|
|
"num_tokens": 724210047.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.072844061773951,
|
|
"grad_norm": 0.21677982811625912,
|
|
"learning_rate": 2.3900944060870793e-06,
|
|
"loss": 1.6424,
|
|
"mean_token_accuracy": 0.6565849661827088,
|
|
"num_tokens": 725101731.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 1.66796875,
|
|
"epoch": 1.0742107421074212,
|
|
"grad_norm": 0.28432053081735237,
|
|
"learning_rate": 2.3865717908975624e-06,
|
|
"loss": 1.6826,
|
|
"mean_token_accuracy": 0.6491518616676331,
|
|
"num_tokens": 726033630.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 1.0755774224408912,
|
|
"grad_norm": 0.22906189326606785,
|
|
"learning_rate": 2.383049175708046e-06,
|
|
"loss": 1.7121,
|
|
"mean_token_accuracy": 0.6473718285560608,
|
|
"num_tokens": 726954469.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 1.0769441027743611,
|
|
"grad_norm": 0.17557645698567032,
|
|
"learning_rate": 2.379526560518529e-06,
|
|
"loss": 1.667,
|
|
"mean_token_accuracy": 0.6519730567932129,
|
|
"num_tokens": 727863704.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 1.0783107831078311,
|
|
"grad_norm": 0.19323307033731357,
|
|
"learning_rate": 2.376003945329012e-06,
|
|
"loss": 1.7039,
|
|
"mean_token_accuracy": 0.6455708682537079,
|
|
"num_tokens": 728804746.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 1.54609375,
|
|
"epoch": 1.079677463441301,
|
|
"grad_norm": 0.24130576135940748,
|
|
"learning_rate": 2.3724813301394957e-06,
|
|
"loss": 1.558,
|
|
"mean_token_accuracy": 0.6707373559474945,
|
|
"num_tokens": 729699805.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.081044143774771,
|
|
"grad_norm": 0.14505151924613072,
|
|
"learning_rate": 2.368958714949979e-06,
|
|
"loss": 1.6075,
|
|
"mean_token_accuracy": 0.6632670342922211,
|
|
"num_tokens": 730589858.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 1.082410824108241,
|
|
"grad_norm": 0.26543306371752795,
|
|
"learning_rate": 2.3654360997604624e-06,
|
|
"loss": 1.621,
|
|
"mean_token_accuracy": 0.6592788696289062,
|
|
"num_tokens": 731498399.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 1.083777504441711,
|
|
"grad_norm": 0.20731546975818027,
|
|
"learning_rate": 2.3619134845709455e-06,
|
|
"loss": 1.6697,
|
|
"mean_token_accuracy": 0.6546447277069092,
|
|
"num_tokens": 732449767.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 1.085144184775181,
|
|
"grad_norm": 0.18556198441921778,
|
|
"learning_rate": 2.358390869381429e-06,
|
|
"loss": 1.6734,
|
|
"mean_token_accuracy": 0.653580516576767,
|
|
"num_tokens": 733365001.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 1.086510865108651,
|
|
"grad_norm": 0.20299357466168064,
|
|
"learning_rate": 2.354868254191912e-06,
|
|
"loss": 1.6933,
|
|
"mean_token_accuracy": 0.6488501012325287,
|
|
"num_tokens": 734314272.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.087877545442121,
|
|
"grad_norm": 0.17473263670784633,
|
|
"learning_rate": 2.3513456390023957e-06,
|
|
"loss": 1.656,
|
|
"mean_token_accuracy": 0.6548760414123536,
|
|
"num_tokens": 735251086.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 1.089244225775591,
|
|
"grad_norm": 0.18598123037278072,
|
|
"learning_rate": 2.347823023812879e-06,
|
|
"loss": 1.6897,
|
|
"mean_token_accuracy": 0.6491748690605164,
|
|
"num_tokens": 736144885.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 1.090610906109061,
|
|
"grad_norm": 0.1917464582741839,
|
|
"learning_rate": 2.3443004086233624e-06,
|
|
"loss": 1.678,
|
|
"mean_token_accuracy": 0.6508231341838837,
|
|
"num_tokens": 737055770.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.0919775864425312,
|
|
"grad_norm": 0.20183071121979948,
|
|
"learning_rate": 2.3407777934338455e-06,
|
|
"loss": 1.6454,
|
|
"mean_token_accuracy": 0.6609357476234436,
|
|
"num_tokens": 737995940.0,
|
|
"step": 7990
|
|
},
|
|
{
"entropy": 1.63203125,
"epoch": 1.0933442667760012,
"grad_norm": 0.1851209854426365,
"learning_rate": 2.3372551782443286e-06,
"loss": 1.6296,
"mean_token_accuracy": 0.659592866897583,
"num_tokens": 738909614.0,
"step": 8000
},
{
|
|
"entropy": 1.69921875,
|
|
"epoch": 1.0947109471094711,
|
|
"grad_norm": 0.20725336514617118,
|
|
"learning_rate": 2.333732563054812e-06,
|
|
"loss": 1.7093,
|
|
"mean_token_accuracy": 0.6485725164413452,
|
|
"num_tokens": 739855550.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 1.0960776274429411,
|
|
"grad_norm": 0.27998292605575964,
|
|
"learning_rate": 2.3302099478652953e-06,
|
|
"loss": 1.6859,
|
|
"mean_token_accuracy": 0.6524621307849884,
|
|
"num_tokens": 740788357.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.097444307776411,
|
|
"grad_norm": 0.20787274512041234,
|
|
"learning_rate": 2.3266873326757784e-06,
|
|
"loss": 1.6809,
|
|
"mean_token_accuracy": 0.6545500874519348,
|
|
"num_tokens": 741680384.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 1.7765625,
|
|
"epoch": 1.098810988109881,
|
|
"grad_norm": 0.1697199645780868,
|
|
"learning_rate": 2.323164717486262e-06,
|
|
"loss": 1.7883,
|
|
"mean_token_accuracy": 0.6322856843471527,
|
|
"num_tokens": 742618115.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.100177668443351,
|
|
"grad_norm": 0.18635608908220488,
|
|
"learning_rate": 2.319642102296745e-06,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6643556237220765,
|
|
"num_tokens": 743543867.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 1.101544348776821,
|
|
"grad_norm": 0.3117879616174589,
|
|
"learning_rate": 2.3161194871072287e-06,
|
|
"loss": 1.6781,
|
|
"mean_token_accuracy": 0.6508804976940155,
|
|
"num_tokens": 744457040.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 1.609375,
|
|
"epoch": 1.102911029110291,
|
|
"grad_norm": 0.23305751920756085,
|
|
"learning_rate": 2.3125968719177118e-06,
|
|
"loss": 1.6023,
|
|
"mean_token_accuracy": 0.6643706560134888,
|
|
"num_tokens": 745361796.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 1.75,
|
|
"epoch": 1.104277709443761,
|
|
"grad_norm": 0.19445472787244794,
|
|
"learning_rate": 2.3090742567281953e-06,
|
|
"loss": 1.7527,
|
|
"mean_token_accuracy": 0.6394903421401977,
|
|
"num_tokens": 746310432.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.105644389777231,
|
|
"grad_norm": 0.1422191937808774,
|
|
"learning_rate": 2.3055516415386785e-06,
|
|
"loss": 1.6421,
|
|
"mean_token_accuracy": 0.6584798157215118,
|
|
"num_tokens": 747221455.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.1070110701107012,
|
|
"grad_norm": 0.20532181072083155,
|
|
"learning_rate": 2.302029026349162e-06,
|
|
"loss": 1.653,
|
|
"mean_token_accuracy": 0.6576489210128784,
|
|
"num_tokens": 748130753.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 1.1083777504441712,
|
|
"grad_norm": 0.28876183997524557,
|
|
"learning_rate": 2.298506411159645e-06,
|
|
"loss": 1.6874,
|
|
"mean_token_accuracy": 0.6507222056388855,
|
|
"num_tokens": 749073493.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 1.1097444307776412,
|
|
"grad_norm": 0.2075025852641672,
|
|
"learning_rate": 2.2949837959701287e-06,
|
|
"loss": 1.6808,
|
|
"mean_token_accuracy": 0.6488942205905914,
|
|
"num_tokens": 750060106.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 1.67421875,
|
|
"epoch": 1.1111111111111112,
|
|
"grad_norm": 0.24088076299146782,
|
|
"learning_rate": 2.291461180780612e-06,
|
|
"loss": 1.6826,
|
|
"mean_token_accuracy": 0.6519817769527435,
|
|
"num_tokens": 751013743.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.1124777914445811,
|
|
"grad_norm": 0.2416340038571547,
|
|
"learning_rate": 2.287938565591095e-06,
|
|
"loss": 1.642,
|
|
"mean_token_accuracy": 0.6572687566280365,
|
|
"num_tokens": 751945625.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.1138444717780511,
|
|
"grad_norm": 0.21276526441758223,
|
|
"learning_rate": 2.2844159504015785e-06,
|
|
"loss": 1.6272,
|
|
"mean_token_accuracy": 0.6625459015369415,
|
|
"num_tokens": 752899458.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 1.115211152111521,
|
|
"grad_norm": 0.18879067453387563,
|
|
"learning_rate": 2.2808933352120616e-06,
|
|
"loss": 1.6738,
|
|
"mean_token_accuracy": 0.6553272724151611,
|
|
"num_tokens": 753770218.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 1.6828125,
|
|
"epoch": 1.116577832444991,
|
|
"grad_norm": 0.14220025944197257,
|
|
"learning_rate": 2.277370720022545e-06,
|
|
"loss": 1.6972,
|
|
"mean_token_accuracy": 0.648533284664154,
|
|
"num_tokens": 754683741.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.117944512778461,
|
|
"grad_norm": 0.2777449921760386,
|
|
"learning_rate": 2.2738481048330283e-06,
|
|
"loss": 1.6618,
|
|
"mean_token_accuracy": 0.6526118993759156,
|
|
"num_tokens": 755630471.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.119311193111931,
|
|
"grad_norm": 0.2116137875386654,
|
|
"learning_rate": 2.2703254896435114e-06,
|
|
"loss": 1.647,
|
|
"mean_token_accuracy": 0.6555546879768371,
|
|
"num_tokens": 756584680.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 1.709375,
|
|
"epoch": 1.120677873445401,
|
|
"grad_norm": 0.23351611367365874,
|
|
"learning_rate": 2.266802874453995e-06,
|
|
"loss": 1.718,
|
|
"mean_token_accuracy": 0.6446906685829162,
|
|
"num_tokens": 757525980.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.122044553778871,
|
|
"grad_norm": 0.20218442559358565,
|
|
"learning_rate": 2.263280259264478e-06,
|
|
"loss": 1.6668,
|
|
"mean_token_accuracy": 0.654239171743393,
|
|
"num_tokens": 758433541.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 1.123411234112341,
|
|
"grad_norm": 0.18209344198816393,
|
|
"learning_rate": 2.259757644074961e-06,
|
|
"loss": 1.5999,
|
|
"mean_token_accuracy": 0.6635931134223938,
|
|
"num_tokens": 759342432.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 1.1247779144458112,
|
|
"grad_norm": 0.33837929499427666,
|
|
"learning_rate": 2.2562350288854447e-06,
|
|
"loss": 1.6619,
|
|
"mean_token_accuracy": 0.6561294138431549,
|
|
"num_tokens": 760272552.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.1261445947792812,
|
|
"grad_norm": 0.27022809865812175,
|
|
"learning_rate": 2.252712413695928e-06,
|
|
"loss": 1.6216,
|
|
"mean_token_accuracy": 0.6592092037200927,
|
|
"num_tokens": 761157949.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 1.1275112751127512,
|
|
"grad_norm": 0.19194089456502092,
|
|
"learning_rate": 2.2491897985064114e-06,
|
|
"loss": 1.6958,
|
|
"mean_token_accuracy": 0.6482615351676941,
|
|
"num_tokens": 762099636.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.1288779554462212,
|
|
"grad_norm": 0.172542356624241,
|
|
"learning_rate": 2.2456671833168945e-06,
|
|
"loss": 1.6358,
|
|
"mean_token_accuracy": 0.6597086310386657,
|
|
"num_tokens": 762988514.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 1.1302446357796911,
|
|
"grad_norm": 0.23500343791554418,
|
|
"learning_rate": 2.242144568127378e-06,
|
|
"loss": 1.6563,
|
|
"mean_token_accuracy": 0.6555641949176788,
|
|
"num_tokens": 763880385.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 1.70625,
|
|
"epoch": 1.1316113161131611,
|
|
"grad_norm": 0.17163830194392246,
|
|
"learning_rate": 2.238621952937861e-06,
|
|
"loss": 1.719,
|
|
"mean_token_accuracy": 0.6463564693927765,
|
|
"num_tokens": 764834395.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 1.7359375,
|
|
"epoch": 1.132977996446631,
|
|
"grad_norm": 0.2866603533994239,
|
|
"learning_rate": 2.2350993377483447e-06,
|
|
"loss": 1.7212,
|
|
"mean_token_accuracy": 0.6467340230941773,
|
|
"num_tokens": 765749148.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.134344676780101,
|
|
"grad_norm": 0.28938617291374374,
|
|
"learning_rate": 2.231576722558828e-06,
|
|
"loss": 1.6633,
|
|
"mean_token_accuracy": 0.6524822831153869,
|
|
"num_tokens": 766651894.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.135711357113571,
|
|
"grad_norm": 0.1808580791618453,
|
|
"learning_rate": 2.2280541073693114e-06,
|
|
"loss": 1.6516,
|
|
"mean_token_accuracy": 0.6560776650905609,
|
|
"num_tokens": 767584101.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 1.137078037447041,
|
|
"grad_norm": 0.23935696971409517,
|
|
"learning_rate": 2.2245314921797945e-06,
|
|
"loss": 1.7061,
|
|
"mean_token_accuracy": 0.6466449081897736,
|
|
"num_tokens": 768517890.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 1.1384447177805113,
|
|
"grad_norm": 0.21676469038354396,
|
|
"learning_rate": 2.2210088769902777e-06,
|
|
"loss": 1.6774,
|
|
"mean_token_accuracy": 0.6521164774894714,
|
|
"num_tokens": 769466022.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 1.5171875,
|
|
"epoch": 1.1398113981139812,
|
|
"grad_norm": 0.2825398500851095,
|
|
"learning_rate": 2.217486261800761e-06,
|
|
"loss": 1.5168,
|
|
"mean_token_accuracy": 0.6775658190250397,
|
|
"num_tokens": 770337870.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.1411780784474512,
|
|
"grad_norm": 0.18154562931546794,
|
|
"learning_rate": 2.2139636466112443e-06,
|
|
"loss": 1.6722,
|
|
"mean_token_accuracy": 0.6513801634311676,
|
|
"num_tokens": 771296791.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.1425447587809212,
|
|
"grad_norm": 0.21009227812126216,
|
|
"learning_rate": 2.2104410314217275e-06,
|
|
"loss": 1.6176,
|
|
"mean_token_accuracy": 0.6618570029735565,
|
|
"num_tokens": 772208735.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 1.7359375,
|
|
"epoch": 1.1439114391143912,
|
|
"grad_norm": 0.24051980210494758,
|
|
"learning_rate": 2.206918416232211e-06,
|
|
"loss": 1.7424,
|
|
"mean_token_accuracy": 0.641651701927185,
|
|
"num_tokens": 773154162.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 1.71875,
|
|
"epoch": 1.1452781194478612,
|
|
"grad_norm": 0.41189042634963796,
|
|
"learning_rate": 2.203395801042694e-06,
|
|
"loss": 1.7114,
|
|
"mean_token_accuracy": 0.6448900997638702,
|
|
"num_tokens": 774097051.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.1466447997813312,
|
|
"grad_norm": 0.3019869249740876,
|
|
"learning_rate": 2.1998731858531777e-06,
|
|
"loss": 1.6525,
|
|
"mean_token_accuracy": 0.6555681884288788,
|
|
"num_tokens": 775075954.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 1.7,
|
|
"epoch": 1.1480114801148011,
|
|
"grad_norm": 0.23491896081847705,
|
|
"learning_rate": 2.196350570663661e-06,
|
|
"loss": 1.718,
|
|
"mean_token_accuracy": 0.6474784672260284,
|
|
"num_tokens": 776010837.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 1.1493781604482711,
|
|
"grad_norm": 0.3072018838248499,
|
|
"learning_rate": 2.192827955474144e-06,
|
|
"loss": 1.6526,
|
|
"mean_token_accuracy": 0.6584468364715577,
|
|
"num_tokens": 776912242.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 1.5875,
|
|
"epoch": 1.150744840781741,
|
|
"grad_norm": 0.2254405706599641,
|
|
"learning_rate": 2.1893053402846275e-06,
|
|
"loss": 1.5958,
|
|
"mean_token_accuracy": 0.6675058543682099,
|
|
"num_tokens": 777842324.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 1.70546875,
|
|
"epoch": 1.152111521115211,
|
|
"grad_norm": 0.23979167519361935,
|
|
"learning_rate": 2.1857827250951106e-06,
|
|
"loss": 1.7238,
|
|
"mean_token_accuracy": 0.6470083832740784,
|
|
"num_tokens": 778797990.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 1.153478201448681,
|
|
"grad_norm": 0.18497477420059294,
|
|
"learning_rate": 2.182260109905594e-06,
|
|
"loss": 1.7284,
|
|
"mean_token_accuracy": 0.6429442822933197,
|
|
"num_tokens": 779720784.0,
|
|
"step": 8440
|
|
},
|
|
{
"entropy": 1.6609375,
"epoch": 1.154844881782151,
"grad_norm": 0.4804902935294741,
"learning_rate": 2.1787374947160777e-06,
"loss": 1.6554,
"mean_token_accuracy": 0.6592356443405152,
"num_tokens": 780627644.0,
"step": 8450
},
{
"entropy": 1.6125,
"epoch": 1.156211562115621,
"grad_norm": 0.17381982430143936,
"learning_rate": 2.175214879526561e-06,
"loss": 1.6308,
"mean_token_accuracy": 0.6579341113567352,
"num_tokens": 781583808.0,
"step": 8460
},
{
"entropy": 1.68125,
"epoch": 1.1575782424490912,
"grad_norm": 0.22325608881438033,
"learning_rate": 2.171692264337044e-06,
"loss": 1.6656,
"mean_token_accuracy": 0.6543825566768646,
"num_tokens": 782490567.0,
"step": 8470
},
{
"entropy": 1.621875,
"epoch": 1.1589449227825612,
"grad_norm": 0.8608020449399852,
"learning_rate": 2.1681696491475275e-06,
"loss": 1.6188,
"mean_token_accuracy": 0.6610362291336059,
"num_tokens": 783390356.0,
"step": 8480
},
{
"entropy": 1.6546875,
"epoch": 1.1603116031160312,
"grad_norm": 0.20706196699553134,
"learning_rate": 2.1646470339580106e-06,
"loss": 1.6461,
"mean_token_accuracy": 0.6574566960334778,
"num_tokens": 784371960.0,
"step": 8490
},
{
"entropy": 1.634375,
"epoch": 1.1616782834495012,
"grad_norm": 0.2343932860326484,
"learning_rate": 2.1611244187684937e-06,
"loss": 1.6434,
"mean_token_accuracy": 0.6589827299118042,
"num_tokens": 785282575.0,
"step": 8500
},
{
"entropy": 1.646875,
"epoch": 1.1630449637829712,
"grad_norm": 0.3084172752334836,
"learning_rate": 2.1576018035789773e-06,
"loss": 1.653,
"mean_token_accuracy": 0.6558621168136597,
"num_tokens": 786240840.0,
"step": 8510
},
{
"entropy": 1.59453125,
"epoch": 1.1644116441164412,
"grad_norm": 0.2258016433255694,
"learning_rate": 2.1540791883894604e-06,
"loss": 1.588,
"mean_token_accuracy": 0.6639123439788819,
"num_tokens": 787109160.0,
"step": 8520
},
{
"entropy": 1.68984375,
"epoch": 1.1657783244499111,
"grad_norm": 0.22514243799299077,
"learning_rate": 2.150556573199944e-06,
"loss": 1.7041,
"mean_token_accuracy": 0.6509670376777649,
"num_tokens": 788017068.0,
"step": 8530
},
{
"entropy": 1.63125,
"epoch": 1.1671450047833811,
"grad_norm": 0.20321566229597834,
"learning_rate": 2.147033958010427e-06,
"loss": 1.6318,
"mean_token_accuracy": 0.6584530770778656,
"num_tokens": 788933317.0,
"step": 8540
},
{
"entropy": 1.68125,
"epoch": 1.168511685116851,
"grad_norm": 0.2637880278992194,
"learning_rate": 2.14351134282091e-06,
"loss": 1.668,
"mean_token_accuracy": 0.6530550599098206,
"num_tokens": 789878899.0,
"step": 8550
},
{
"entropy": 1.603125,
"epoch": 1.169878365450321,
"grad_norm": 0.16993835744516453,
"learning_rate": 2.1399887276313937e-06,
"loss": 1.5977,
"mean_token_accuracy": 0.6655386388301849,
"num_tokens": 790831768.0,
"step": 8560
},
{
"entropy": 1.6859375,
"epoch": 1.1712450457837913,
"grad_norm": 0.22356739641617784,
"learning_rate": 2.136466112441877e-06,
"loss": 1.6944,
"mean_token_accuracy": 0.6515520215034485,
"num_tokens": 791777294.0,
"step": 8570
},
{
"entropy": 1.63203125,
"epoch": 1.1726117261172613,
"grad_norm": 0.20976905990984537,
"learning_rate": 2.1329434972523604e-06,
"loss": 1.6521,
"mean_token_accuracy": 0.6559103310108185,
"num_tokens": 792700537.0,
"step": 8580
},
{
"entropy": 1.64453125,
"epoch": 1.1739784064507313,
"grad_norm": 0.32238664257210203,
"learning_rate": 2.1294208820628435e-06,
"loss": 1.6337,
"mean_token_accuracy": 0.6589359760284423,
"num_tokens": 793599743.0,
"step": 8590
},
{
"entropy": 1.70625,
"epoch": 1.1753450867842012,
"grad_norm": 0.2298434987329029,
"learning_rate": 2.125898266873327e-06,
"loss": 1.7171,
"mean_token_accuracy": 0.6433700501918793,
"num_tokens": 794573530.0,
"step": 8600
},
{
"entropy": 1.6984375,
"epoch": 1.1767117671176712,
"grad_norm": 0.24361768592571556,
"learning_rate": 2.12237565168381e-06,
"loss": 1.7129,
"mean_token_accuracy": 0.6471936821937561,
"num_tokens": 795509701.0,
"step": 8610
},
{
"entropy": 1.6671875,
"epoch": 1.1780784474511412,
"grad_norm": 0.2612440069398371,
"learning_rate": 2.1188530364942938e-06,
"loss": 1.6695,
"mean_token_accuracy": 0.6525741815567017,
"num_tokens": 796433588.0,
"step": 8620
},
{
"entropy": 1.57109375,
"epoch": 1.1794451277846112,
"grad_norm": 0.20303096702404172,
"learning_rate": 2.115330421304777e-06,
"loss": 1.5875,
"mean_token_accuracy": 0.6639281928539276,
"num_tokens": 797350364.0,
"step": 8630
},
{
"entropy": 1.65078125,
"epoch": 1.1808118081180812,
"grad_norm": 0.2060955816359563,
"learning_rate": 2.1118078061152604e-06,
"loss": 1.6554,
"mean_token_accuracy": 0.6594086289405823,
"num_tokens": 798320124.0,
"step": 8640
},
{
"entropy": 1.6234375,
"epoch": 1.1821784884515512,
"grad_norm": 0.2834408475261294,
"learning_rate": 2.1082851909257435e-06,
"loss": 1.6574,
"mean_token_accuracy": 0.6556096851825715,
"num_tokens": 799256917.0,
"step": 8650
},
{
"entropy": 1.62265625,
"epoch": 1.1835451687850211,
"grad_norm": 0.18469091271511165,
"learning_rate": 2.1047625757362267e-06,
"loss": 1.6319,
"mean_token_accuracy": 0.659127014875412,
"num_tokens": 800205993.0,
"step": 8660
},
{
"entropy": 1.64921875,
"epoch": 1.1849118491184911,
"grad_norm": 0.2078426490551358,
"learning_rate": 2.1012399605467102e-06,
"loss": 1.6765,
"mean_token_accuracy": 0.6536607444286346,
"num_tokens": 801097875.0,
"step": 8670
},
{
"entropy": 1.68828125,
"epoch": 1.186278529451961,
"grad_norm": 0.2386226564759819,
"learning_rate": 2.0977173453571933e-06,
"loss": 1.7023,
"mean_token_accuracy": 0.6518602788448333,
"num_tokens": 801983394.0,
"step": 8680
},
{
"entropy": 1.68515625,
"epoch": 1.187645209785431,
"grad_norm": 0.21622746915311986,
"learning_rate": 2.0941947301676765e-06,
"loss": 1.7011,
"mean_token_accuracy": 0.649648004770279,
"num_tokens": 802863390.0,
"step": 8690
},
{
"entropy": 1.61328125,
"epoch": 1.189011890118901,
"grad_norm": 0.2079172089196483,
"learning_rate": 2.09067211497816e-06,
"loss": 1.6073,
"mean_token_accuracy": 0.6601165175437927,
"num_tokens": 803795285.0,
"step": 8700
},
{
"entropy": 1.64140625,
"epoch": 1.1903785704523713,
"grad_norm": 0.36945607264996044,
"learning_rate": 2.087149499788643e-06,
"loss": 1.6405,
"mean_token_accuracy": 0.6547869086265564,
"num_tokens": 804709659.0,
"step": 8710
},
{
"entropy": 1.6875,
"epoch": 1.1917452507858413,
"grad_norm": 0.19774174671380912,
"learning_rate": 2.0836268845991263e-06,
"loss": 1.6988,
"mean_token_accuracy": 0.6465525329113007,
"num_tokens": 805627303.0,
"step": 8720
},
{
"entropy": 1.6703125,
"epoch": 1.1931119311193112,
"grad_norm": 0.20592458449059406,
"learning_rate": 2.08010426940961e-06,
"loss": 1.6963,
"mean_token_accuracy": 0.651505845785141,
"num_tokens": 806581835.0,
"step": 8730
},
{
"entropy": 1.65390625,
"epoch": 1.1944786114527812,
"grad_norm": 0.2271462540559992,
"learning_rate": 2.076581654220093e-06,
"loss": 1.6756,
"mean_token_accuracy": 0.6538717925548554,
"num_tokens": 807513343.0,
"step": 8740
},
{
"entropy": 1.615625,
"epoch": 1.1958452917862512,
"grad_norm": 0.26854071093317183,
"learning_rate": 2.0730590390305765e-06,
"loss": 1.6168,
"mean_token_accuracy": 0.6625005424022674,
"num_tokens": 808427719.0,
"step": 8750
},
{
"entropy": 1.67421875,
"epoch": 1.1972119721197212,
"grad_norm": 0.21486993569526713,
"learning_rate": 2.0695364238410596e-06,
"loss": 1.6778,
"mean_token_accuracy": 0.6524321258068084,
"num_tokens": 809402793.0,
"step": 8760
},
{
"entropy": 1.6921875,
"epoch": 1.1985786524531912,
"grad_norm": 0.24966257550485255,
"learning_rate": 2.066013808651543e-06,
"loss": 1.7134,
"mean_token_accuracy": 0.6455506265163422,
"num_tokens": 810321227.0,
"step": 8770
},
{
"entropy": 1.68515625,
"epoch": 1.1999453327866612,
"grad_norm": 0.19941805250441924,
"learning_rate": 2.0624911934620263e-06,
"loss": 1.681,
"mean_token_accuracy": 0.651288878917694,
"num_tokens": 811205990.0,
"step": 8780
},
{
"entropy": 1.71484375,
"epoch": 1.2013120131201311,
"grad_norm": 0.18098633653735322,
"learning_rate": 2.05896857827251e-06,
"loss": 1.7176,
"mean_token_accuracy": 0.6431676924228669,
"num_tokens": 812168419.0,
"step": 8790
},
{
"entropy": 1.62265625,
"epoch": 1.2026786934536011,
"grad_norm": 0.20822044307548077,
"learning_rate": 2.055445963082993e-06,
"loss": 1.6295,
"mean_token_accuracy": 0.6620640456676483,
"num_tokens": 813080293.0,
"step": 8800
},
{
"entropy": 1.62578125,
"epoch": 1.2040453737870713,
"grad_norm": 0.18825121277939905,
"learning_rate": 2.0519233478934765e-06,
"loss": 1.6376,
"mean_token_accuracy": 0.6579415977001191,
"num_tokens": 814025818.0,
"step": 8810
},
{
"entropy": 1.6765625,
"epoch": 1.2054120541205413,
"grad_norm": 0.16168845008137853,
"learning_rate": 2.0484007327039596e-06,
"loss": 1.6867,
"mean_token_accuracy": 0.6490220248699188,
"num_tokens": 814932249.0,
"step": 8820
},
{
"entropy": 1.65625,
"epoch": 1.2067787344540113,
"grad_norm": 0.28686982394117166,
"learning_rate": 2.0448781175144427e-06,
"loss": 1.6513,
"mean_token_accuracy": 0.6568827688694,
"num_tokens": 815852970.0,
"step": 8830
},
{
"entropy": 1.63515625,
"epoch": 1.2081454147874813,
"grad_norm": 0.20780293978266942,
"learning_rate": 2.0413555023249263e-06,
"loss": 1.6457,
"mean_token_accuracy": 0.6577168047428131,
"num_tokens": 816785239.0,
"step": 8840
},
{
"entropy": 1.64296875,
"epoch": 1.2095120951209513,
"grad_norm": 0.22971335874144547,
"learning_rate": 2.0378328871354094e-06,
"loss": 1.6479,
"mean_token_accuracy": 0.6549285113811493,
"num_tokens": 817737467.0,
"step": 8850
},
{
"entropy": 1.66484375,
"epoch": 1.2108787754544212,
"grad_norm": 0.2622969849533522,
"learning_rate": 2.034310271945893e-06,
"loss": 1.6753,
"mean_token_accuracy": 0.6549736022949219,
"num_tokens": 818706509.0,
"step": 8860
},
{
"entropy": 1.61875,
"epoch": 1.2122454557878912,
"grad_norm": 0.1583563424706435,
"learning_rate": 2.030787656756376e-06,
"loss": 1.6278,
"mean_token_accuracy": 0.6570086121559143,
"num_tokens": 819615233.0,
"step": 8870
},
{
"entropy": 1.59296875,
"epoch": 1.2136121361213612,
"grad_norm": 0.19848301862389592,
"learning_rate": 2.027265041566859e-06,
"loss": 1.6224,
"mean_token_accuracy": 0.6616878390312195,
"num_tokens": 820538306.0,
"step": 8880
},
{
"entropy": 1.6265625,
"epoch": 1.2149788164548312,
"grad_norm": 0.28298158928773687,
"learning_rate": 2.0237424263773427e-06,
"loss": 1.626,
"mean_token_accuracy": 0.6619326829910278,
"num_tokens": 821425988.0,
"step": 8890
},
{
"entropy": 1.56640625,
"epoch": 1.2163454967883012,
"grad_norm": 0.19146836981687443,
"learning_rate": 2.020219811187826e-06,
"loss": 1.5758,
"mean_token_accuracy": 0.6671996116638184,
"num_tokens": 822375356.0,
"step": 8900
},
{
"entropy": 1.690625,
"epoch": 1.2177121771217712,
"grad_norm": 0.30981838464985567,
"learning_rate": 2.0166971959983094e-06,
"loss": 1.7136,
"mean_token_accuracy": 0.644020164012909,
"num_tokens": 823310773.0,
"step": 8910
},
{
"entropy": 1.6265625,
"epoch": 1.2190788574552411,
"grad_norm": 0.17808772160940325,
"learning_rate": 2.0131745808087925e-06,
"loss": 1.6372,
"mean_token_accuracy": 0.656511414051056,
"num_tokens": 824241438.0,
"step": 8920
},
{
"entropy": 1.65625,
"epoch": 1.2204455377887111,
"grad_norm": 0.26546167978759383,
"learning_rate": 2.009651965619276e-06,
"loss": 1.6628,
"mean_token_accuracy": 0.6523425161838532,
"num_tokens": 825163370.0,
"step": 8930
},
{
"entropy": 1.72109375,
"epoch": 1.221812218122181,
"grad_norm": 0.2017532339607059,
"learning_rate": 2.0061293504297592e-06,
"loss": 1.7323,
"mean_token_accuracy": 0.6434034109115601,
"num_tokens": 826106691.0,
"step": 8940
},
{
"entropy": 1.6515625,
"epoch": 1.2231788984556513,
"grad_norm": 0.22068158543252683,
"learning_rate": 2.0026067352402428e-06,
"loss": 1.6548,
"mean_token_accuracy": 0.6554613053798676,
"num_tokens": 827032921.0,
"step": 8950
},
{
"entropy": 1.678125,
"epoch": 1.2245455787891213,
"grad_norm": 0.1636974418710655,
"learning_rate": 1.999084120050726e-06,
"loss": 1.6974,
"mean_token_accuracy": 0.6490798175334931,
"num_tokens": 827961697.0,
"step": 8960
},
{
"entropy": 1.60859375,
"epoch": 1.2259122591225913,
"grad_norm": 0.2094449677647114,
"learning_rate": 1.9955615048612094e-06,
"loss": 1.5959,
"mean_token_accuracy": 0.6648080348968506,
"num_tokens": 828882065.0,
"step": 8970
},
{
"entropy": 1.64765625,
"epoch": 1.2272789394560613,
"grad_norm": 0.35245869487058423,
"learning_rate": 1.9920388896716926e-06,
"loss": 1.6538,
"mean_token_accuracy": 0.6561968445777893,
"num_tokens": 829818247.0,
"step": 8980
},
{
"entropy": 1.65390625,
"epoch": 1.2286456197895312,
"grad_norm": 0.2305103041008018,
"learning_rate": 1.9885162744821757e-06,
"loss": 1.6643,
"mean_token_accuracy": 0.6569099187850952,
"num_tokens": 830745574.0,
"step": 8990
},
{
"entropy": 1.64296875,
"epoch": 1.2300123001230012,
"grad_norm": 0.1702515857970199,
"learning_rate": 1.9849936592926592e-06,
"loss": 1.6488,
"mean_token_accuracy": 0.656689727306366,
"num_tokens": 831606782.0,
"step": 9000
},
{
"entropy": 1.653125,
"epoch": 1.2313789804564712,
"grad_norm": 0.20110860274244302,
"learning_rate": 1.9814710441031424e-06,
"loss": 1.6432,
"mean_token_accuracy": 0.6562348902225494,
"num_tokens": 832516152.0,
"step": 9010
},
{
"entropy": 1.61953125,
"epoch": 1.2327456607899412,
"grad_norm": 0.2847412295156812,
"learning_rate": 1.9779484289136255e-06,
"loss": 1.6135,
"mean_token_accuracy": 0.664428836107254,
"num_tokens": 833415792.0,
"step": 9020
},
{
"entropy": 1.64921875,
"epoch": 1.2341123411234112,
"grad_norm": 0.28661449316584486,
"learning_rate": 1.974425813724109e-06,
"loss": 1.6667,
"mean_token_accuracy": 0.6542934656143189,
"num_tokens": 834352285.0,
"step": 9030
},
{
"entropy": 1.6328125,
"epoch": 1.2354790214568812,
"grad_norm": 0.21379852678989872,
"learning_rate": 1.970903198534592e-06,
"loss": 1.6515,
"mean_token_accuracy": 0.6561932861804962,
"num_tokens": 835294889.0,
"step": 9040
},
{
"entropy": 1.66640625,
"epoch": 1.2368457017903514,
"grad_norm": 0.22906715023448204,
"learning_rate": 1.9673805833450753e-06,
"loss": 1.6645,
"mean_token_accuracy": 0.6515011131763458,
"num_tokens": 836233174.0,
"step": 9050
},
{
"entropy": 1.66875,
"epoch": 1.2382123821238213,
"grad_norm": 0.1714175867540852,
"learning_rate": 1.963857968155559e-06,
"loss": 1.6713,
"mean_token_accuracy": 0.6523460984230042,
"num_tokens": 837179158.0,
"step": 9060
},
{
"entropy": 1.65234375,
"epoch": 1.2395790624572913,
"grad_norm": 0.6545995773322464,
"learning_rate": 1.960335352966042e-06,
"loss": 1.6448,
"mean_token_accuracy": 0.6607184469699859,
"num_tokens": 838104191.0,
"step": 9070
},
{
"entropy": 1.6484375,
"epoch": 1.2409457427907613,
"grad_norm": 0.2295522650670773,
"learning_rate": 1.9568127377765255e-06,
"loss": 1.6548,
"mean_token_accuracy": 0.6530137896537781,
"num_tokens": 839054627.0,
"step": 9080
},
{
"entropy": 1.665625,
"epoch": 1.2423124231242313,
"grad_norm": 0.20111090387056993,
"learning_rate": 1.9532901225870086e-06,
"loss": 1.6676,
"mean_token_accuracy": 0.6540429472923279,
"num_tokens": 840025729.0,
"step": 9090
},
{
"entropy": 1.60703125,
"epoch": 1.2436791034577013,
"grad_norm": 0.23608021542429689,
"learning_rate": 1.949767507397492e-06,
"loss": 1.6157,
"mean_token_accuracy": 0.6590730607509613,
"num_tokens": 840925190.0,
"step": 9100
},
{
"entropy": 1.6734375,
"epoch": 1.2450457837911713,
"grad_norm": 0.2398102346521419,
"learning_rate": 1.9462448922079753e-06,
"loss": 1.6853,
"mean_token_accuracy": 0.6505544245243072,
"num_tokens": 841841514.0,
"step": 9110
},
{
"entropy": 1.63046875,
"epoch": 1.2464124641246412,
"grad_norm": 0.17532103845645028,
"learning_rate": 1.942722277018459e-06,
"loss": 1.6389,
"mean_token_accuracy": 0.6606233417987823,
"num_tokens": 842738588.0,
"step": 9120
},
{
"entropy": 1.69140625,
"epoch": 1.2477791444581112,
"grad_norm": 0.18932369298340038,
"learning_rate": 1.939199661828942e-06,
"loss": 1.7063,
"mean_token_accuracy": 0.6459846735000611,
"num_tokens": 843649588.0,
"step": 9130
},
{
"entropy": 1.54375,
"epoch": 1.2491458247915812,
"grad_norm": 0.2075346686795944,
"learning_rate": 1.9356770466394255e-06,
"loss": 1.5511,
"mean_token_accuracy": 0.6714156746864319,
"num_tokens": 844505456.0,
"step": 9140
},
{
"entropy": 1.59296875,
"epoch": 1.2505125051250512,
"grad_norm": 0.19443070312865782,
"learning_rate": 1.9321544314499086e-06,
"loss": 1.5954,
"mean_token_accuracy": 0.6642876744270325,
"num_tokens": 845420117.0,
"step": 9150
},
{
"entropy": 1.63671875,
"epoch": 1.2518791854585212,
"grad_norm": 0.25893868547540594,
"learning_rate": 1.9286318162603917e-06,
"loss": 1.6629,
"mean_token_accuracy": 0.652742224931717,
"num_tokens": 846361663.0,
"step": 9160
},
{
"entropy": 1.58515625,
"epoch": 1.2532458657919912,
"grad_norm": 0.3645495157302626,
"learning_rate": 1.9251092010708753e-06,
"loss": 1.5935,
"mean_token_accuracy": 0.6672346711158752,
"num_tokens": 847283407.0,
"step": 9170
},
{
"entropy": 1.659375,
"epoch": 1.2546125461254611,
"grad_norm": 0.20083768750784634,
"learning_rate": 1.9215865858813584e-06,
"loss": 1.6652,
"mean_token_accuracy": 0.6522323966026307,
"num_tokens": 848201835.0,
"step": 9180
},
{
"entropy": 1.65078125,
"epoch": 1.2559792264589311,
"grad_norm": 0.20255458397377624,
"learning_rate": 1.918063970691842e-06,
"loss": 1.6714,
"mean_token_accuracy": 0.6512964725494385,
"num_tokens": 849137408.0,
"step": 9190
},
{
"entropy": 1.70546875,
"epoch": 1.2573459067924013,
"grad_norm": 0.22427318202184623,
"learning_rate": 1.914541355502325e-06,
"loss": 1.6799,
"mean_token_accuracy": 0.6506673991680145,
"num_tokens": 850016286.0,
"step": 9200
},
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.2587125871258713,
|
|
"grad_norm": 0.486782948581546,
|
|
"learning_rate": 1.9110187403128082e-06,
|
|
"loss": 1.6264,
|
|
"mean_token_accuracy": 0.6580170094966888,
|
|
"num_tokens": 850929735.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 1.2600792674593413,
|
|
"grad_norm": 0.20604226391758068,
|
|
"learning_rate": 1.9074961251232918e-06,
|
|
"loss": 1.6253,
|
|
"mean_token_accuracy": 0.6618473947048187,
|
|
"num_tokens": 851902043.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.2614459477928113,
|
|
"grad_norm": 0.2575498709968255,
|
|
"learning_rate": 1.903973509933775e-06,
|
|
"loss": 1.6216,
|
|
"mean_token_accuracy": 0.6577454745769501,
|
|
"num_tokens": 852807785.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 1.2628126281262813,
|
|
"grad_norm": 0.17429827140233112,
|
|
"learning_rate": 1.9004508947442582e-06,
|
|
"loss": 1.6999,
|
|
"mean_token_accuracy": 0.6482182502746582,
|
|
"num_tokens": 853704790.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.2641793084597512,
|
|
"grad_norm": 0.16949384399767042,
|
|
"learning_rate": 1.8969282795547418e-06,
|
|
"loss": 1.6246,
|
|
"mean_token_accuracy": 0.659442687034607,
|
|
"num_tokens": 854605440.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 1.65859375,
|
|
"epoch": 1.2655459887932212,
|
|
"grad_norm": 0.20932812355587926,
|
|
"learning_rate": 1.8934056643652249e-06,
|
|
"loss": 1.6778,
|
|
"mean_token_accuracy": 0.6515567123889923,
|
|
"num_tokens": 855518339.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.2669126691266912,
|
|
"grad_norm": 0.21279538236033893,
|
|
"learning_rate": 1.889883049175708e-06,
|
|
"loss": 1.6404,
|
|
"mean_token_accuracy": 0.6601641833782196,
|
|
"num_tokens": 856427407.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.2682793494601612,
|
|
"grad_norm": 0.23508240785315832,
|
|
"learning_rate": 1.8863604339861916e-06,
|
|
"loss": 1.6032,
|
|
"mean_token_accuracy": 0.6612719237804413,
|
|
"num_tokens": 857348476.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 1.2696460297936314,
|
|
"grad_norm": 0.21647131687517915,
|
|
"learning_rate": 1.8828378187966747e-06,
|
|
"loss": 1.6501,
|
|
"mean_token_accuracy": 0.6573628544807434,
|
|
"num_tokens": 858289826.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.2710127101271014,
|
|
"grad_norm": 0.19538763171073942,
|
|
"learning_rate": 1.8793152036071582e-06,
|
|
"loss": 1.6707,
|
|
"mean_token_accuracy": 0.654766857624054,
|
|
"num_tokens": 859204799.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 1.2723793904605714,
|
|
"grad_norm": 0.16380442243801943,
|
|
"learning_rate": 1.8757925884176414e-06,
|
|
"loss": 1.6699,
|
|
"mean_token_accuracy": 0.6501084387302398,
|
|
"num_tokens": 860137169.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 1.7296875,
|
|
"epoch": 1.2737460707940413,
|
|
"grad_norm": 0.359461298514425,
|
|
"learning_rate": 1.8722699732281247e-06,
|
|
"loss": 1.7493,
|
|
"mean_token_accuracy": 0.6382077157497406,
|
|
"num_tokens": 861116541.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 1.68984375,
|
|
"epoch": 1.2751127511275113,
|
|
"grad_norm": 0.2531620362980635,
|
|
"learning_rate": 1.868747358038608e-06,
|
|
"loss": 1.6898,
|
|
"mean_token_accuracy": 0.6499089181423188,
|
|
"num_tokens": 862025805.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.2764794314609813,
|
|
"grad_norm": 0.49769186115864356,
|
|
"learning_rate": 1.8652247428490914e-06,
|
|
"loss": 1.6099,
|
|
"mean_token_accuracy": 0.6651976644992829,
|
|
"num_tokens": 862947799.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.2778461117944513,
|
|
"grad_norm": 0.17735358883490246,
|
|
"learning_rate": 1.8617021276595745e-06,
|
|
"loss": 1.6703,
|
|
"mean_token_accuracy": 0.6540926039218903,
|
|
"num_tokens": 863856635.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.2792127921279213,
|
|
"grad_norm": 0.3109169160009544,
|
|
"learning_rate": 1.858179512470058e-06,
|
|
"loss": 1.6321,
|
|
"mean_token_accuracy": 0.6617400705814361,
|
|
"num_tokens": 864777045.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 1.2805794724613913,
|
|
"grad_norm": 0.15947573892269093,
|
|
"learning_rate": 1.8546568972805412e-06,
|
|
"loss": 1.6004,
|
|
"mean_token_accuracy": 0.6634768426418305,
|
|
"num_tokens": 865756055.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 1.2819461527948612,
|
|
"grad_norm": 0.20448921577961396,
|
|
"learning_rate": 1.8511342820910245e-06,
|
|
"loss": 1.6092,
|
|
"mean_token_accuracy": 0.6609838485717774,
|
|
"num_tokens": 866672730.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 1.2833128331283312,
|
|
"grad_norm": 0.22923748360747523,
|
|
"learning_rate": 1.8476116669015078e-06,
|
|
"loss": 1.6169,
|
|
"mean_token_accuracy": 0.663374525308609,
|
|
"num_tokens": 867610399.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.2846795134618012,
|
|
"grad_norm": 0.24834600053973244,
|
|
"learning_rate": 1.8440890517119912e-06,
|
|
"loss": 1.5767,
|
|
"mean_token_accuracy": 0.6671038806438446,
|
|
"num_tokens": 868542080.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 1.2860461937952712,
|
|
"grad_norm": 0.2237936614360355,
|
|
"learning_rate": 1.8405664365224743e-06,
|
|
"loss": 1.6114,
|
|
"mean_token_accuracy": 0.661199814081192,
|
|
"num_tokens": 869428363.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 1.2874128741287412,
|
|
"grad_norm": 0.19869735470659045,
|
|
"learning_rate": 1.8370438213329578e-06,
|
|
"loss": 1.7213,
|
|
"mean_token_accuracy": 0.645433908700943,
|
|
"num_tokens": 870403983.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.2887795544622112,
|
|
"grad_norm": 0.1810193934487422,
|
|
"learning_rate": 1.833521206143441e-06,
|
|
"loss": 1.6638,
|
|
"mean_token_accuracy": 0.6547651827335358,
|
|
"num_tokens": 871331635.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.2901462347956814,
|
|
"grad_norm": 0.159488117007368,
|
|
"learning_rate": 1.8299985909539245e-06,
|
|
"loss": 1.6846,
|
|
"mean_token_accuracy": 0.6496282041072845,
|
|
"num_tokens": 872219987.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.2915129151291513,
|
|
"grad_norm": 0.4736131028565075,
|
|
"learning_rate": 1.8264759757644076e-06,
|
|
"loss": 1.6559,
|
|
"mean_token_accuracy": 0.6542121112346649,
|
|
"num_tokens": 873135690.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.2928795954626213,
|
|
"grad_norm": 0.6277015814842704,
|
|
"learning_rate": 1.822953360574891e-06,
|
|
"loss": 1.5994,
|
|
"mean_token_accuracy": 0.6637328267097473,
|
|
"num_tokens": 874017547.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 1.2942462757960913,
|
|
"grad_norm": 0.2037676039971423,
|
|
"learning_rate": 1.8194307453853743e-06,
|
|
"loss": 1.5937,
|
|
"mean_token_accuracy": 0.6655194461345673,
|
|
"num_tokens": 874929826.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 1.2956129561295613,
|
|
"grad_norm": 0.2895571509744626,
|
|
"learning_rate": 1.8159081301958576e-06,
|
|
"loss": 1.6943,
|
|
"mean_token_accuracy": 0.6517538070678711,
|
|
"num_tokens": 875872578.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 1.2969796364630313,
|
|
"grad_norm": 0.15470683542165786,
|
|
"learning_rate": 1.8123855150063408e-06,
|
|
"loss": 1.6368,
|
|
"mean_token_accuracy": 0.657831472158432,
|
|
"num_tokens": 876811646.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.2983463167965013,
|
|
"grad_norm": 0.17880538955377323,
|
|
"learning_rate": 1.8088628998168243e-06,
|
|
"loss": 1.6497,
|
|
"mean_token_accuracy": 0.6573770582675934,
|
|
"num_tokens": 877715948.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.2997129971299712,
|
|
"grad_norm": 0.3041723264121226,
|
|
"learning_rate": 1.8053402846273074e-06,
|
|
"loss": 1.5772,
|
|
"mean_token_accuracy": 0.6652538180351257,
|
|
"num_tokens": 878628677.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.3010796774634412,
|
|
"grad_norm": 0.22178491078186383,
|
|
"learning_rate": 1.8018176694377906e-06,
|
|
"loss": 1.5711,
|
|
"mean_token_accuracy": 0.6676675915718079,
|
|
"num_tokens": 879532012.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.3024463577969114,
|
|
"grad_norm": 0.2114713089504065,
|
|
"learning_rate": 1.798295054248274e-06,
|
|
"loss": 1.6891,
|
|
"mean_token_accuracy": 0.6511152267456055,
|
|
"num_tokens": 880422072.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 1.3038130381303814,
|
|
"grad_norm": 0.20799652041017186,
|
|
"learning_rate": 1.7947724390587572e-06,
|
|
"loss": 1.6157,
|
|
"mean_token_accuracy": 0.6586639106273651,
|
|
"num_tokens": 881317032.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 1.3051797184638514,
|
|
"grad_norm": 0.248536082583271,
|
|
"learning_rate": 1.7912498238692408e-06,
|
|
"loss": 1.7046,
|
|
"mean_token_accuracy": 0.6486743450164795,
|
|
"num_tokens": 882260746.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.3065463987973214,
|
|
"grad_norm": 0.17189312474021376,
|
|
"learning_rate": 1.7877272086797239e-06,
|
|
"loss": 1.6081,
|
|
"mean_token_accuracy": 0.6640940308570862,
|
|
"num_tokens": 883187293.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.3079130791307914,
|
|
"grad_norm": 0.15776928635459278,
|
|
"learning_rate": 1.7842045934902072e-06,
|
|
"loss": 1.6676,
|
|
"mean_token_accuracy": 0.6512872517108917,
|
|
"num_tokens": 884127306.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.3092797594642613,
|
|
"grad_norm": 0.20985630489304255,
|
|
"learning_rate": 1.7806819783006908e-06,
|
|
"loss": 1.6384,
|
|
"mean_token_accuracy": 0.6588730633258819,
|
|
"num_tokens": 885045930.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.3106464397977313,
|
|
"grad_norm": 0.21045013101395268,
|
|
"learning_rate": 1.777159363111174e-06,
|
|
"loss": 1.6424,
|
|
"mean_token_accuracy": 0.6568190157413483,
|
|
"num_tokens": 885974036.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.3120131201312013,
|
|
"grad_norm": 0.254786660134225,
|
|
"learning_rate": 1.773636747921657e-06,
|
|
"loss": 1.606,
|
|
"mean_token_accuracy": 0.6617513597011566,
|
|
"num_tokens": 886851777.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 1.3133798004646713,
|
|
"grad_norm": 0.17796507810001302,
|
|
"learning_rate": 1.7701141327321406e-06,
|
|
"loss": 1.6647,
|
|
"mean_token_accuracy": 0.6552273869514466,
|
|
"num_tokens": 887812587.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 1.3147464807981413,
|
|
"grad_norm": 1.631442978515122,
|
|
"learning_rate": 1.7665915175426237e-06,
|
|
"loss": 1.677,
|
|
"mean_token_accuracy": 0.6501412391662598,
|
|
"num_tokens": 888788173.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.3161131611316113,
|
|
"grad_norm": 0.2592113227470123,
|
|
"learning_rate": 1.763068902353107e-06,
|
|
"loss": 1.5898,
|
|
"mean_token_accuracy": 0.6631907820701599,
|
|
"num_tokens": 889686324.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.3174798414650812,
|
|
"grad_norm": 0.4000378477101872,
|
|
"learning_rate": 1.7595462871635904e-06,
|
|
"loss": 1.6597,
|
|
"mean_token_accuracy": 0.6506197988986969,
|
|
"num_tokens": 890574774.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 1.5984375,
|
|
"epoch": 1.3188465217985512,
|
|
"grad_norm": 0.22444164318067952,
|
|
"learning_rate": 1.7560236719740737e-06,
|
|
"loss": 1.6031,
|
|
"mean_token_accuracy": 0.6622675061225891,
|
|
"num_tokens": 891510483.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.3202132021320212,
|
|
"grad_norm": 0.2159139142329402,
|
|
"learning_rate": 1.752501056784557e-06,
|
|
"loss": 1.6201,
|
|
"mean_token_accuracy": 0.6652886092662811,
|
|
"num_tokens": 892466295.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.3215798824654912,
|
|
"grad_norm": 0.31986561508709227,
|
|
"learning_rate": 1.7489784415950404e-06,
|
|
"loss": 1.6184,
|
|
"mean_token_accuracy": 0.6590754091739655,
|
|
"num_tokens": 893354798.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.3229465627989614,
|
|
"grad_norm": 0.19066574877885106,
|
|
"learning_rate": 1.7454558264055235e-06,
|
|
"loss": 1.5981,
|
|
"mean_token_accuracy": 0.6634791910648346,
|
|
"num_tokens": 894254663.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 1.3243132431324314,
|
|
"grad_norm": 0.17851287793154413,
|
|
"learning_rate": 1.741933211216007e-06,
|
|
"loss": 1.6235,
|
|
"mean_token_accuracy": 0.6633592784404755,
|
|
"num_tokens": 895151834.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 1.3256799234659014,
|
|
"grad_norm": 0.22921798350579514,
|
|
"learning_rate": 1.7384105960264902e-06,
|
|
"loss": 1.6216,
|
|
"mean_token_accuracy": 0.662970346212387,
|
|
"num_tokens": 896088376.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 1.76640625,
|
|
"epoch": 1.3270466037993713,
|
|
"grad_norm": 0.31789013732255117,
|
|
"learning_rate": 1.7348879808369735e-06,
|
|
"loss": 1.7798,
|
|
"mean_token_accuracy": 0.6330322861671448,
|
|
"num_tokens": 897033254.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.3284132841328413,
|
|
"grad_norm": 0.21005893840876755,
|
|
"learning_rate": 1.7313653656474568e-06,
|
|
"loss": 1.637,
|
|
"mean_token_accuracy": 0.6578575730323791,
|
|
"num_tokens": 897980732.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 1.3297799644663113,
|
|
"grad_norm": 0.23364506946374117,
|
|
"learning_rate": 1.7278427504579402e-06,
|
|
"loss": 1.6841,
|
|
"mean_token_accuracy": 0.6519119083881378,
|
|
"num_tokens": 898942541.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 1.3311466447997813,
|
|
"grad_norm": 0.17101088137361317,
|
|
"learning_rate": 1.7243201352684233e-06,
|
|
"loss": 1.7157,
|
|
"mean_token_accuracy": 0.6475227832794189,
|
|
"num_tokens": 899913206.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 1.696875,
|
|
"epoch": 1.3325133251332513,
|
|
"grad_norm": 0.31328253981207554,
|
|
"learning_rate": 1.7207975200789068e-06,
|
|
"loss": 1.7054,
|
|
"mean_token_accuracy": 0.6459831833839417,
|
|
"num_tokens": 900834380.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.3338800054667213,
|
|
"grad_norm": 0.1633312068477104,
|
|
"learning_rate": 1.71727490488939e-06,
|
|
"loss": 1.6667,
|
|
"mean_token_accuracy": 0.652851790189743,
|
|
"num_tokens": 901704105.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.3352466858001915,
|
|
"grad_norm": 0.19932773187341804,
|
|
"learning_rate": 1.7137522896998735e-06,
|
|
"loss": 1.6614,
|
|
"mean_token_accuracy": 0.6528195798397064,
|
|
"num_tokens": 902634899.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 1.5671875,
|
|
"epoch": 1.3366133661336614,
|
|
"grad_norm": 0.16696597595497617,
|
|
"learning_rate": 1.7102296745103566e-06,
|
|
"loss": 1.5817,
|
|
"mean_token_accuracy": 0.6692706286907196,
|
|
"num_tokens": 903526813.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.3379800464671314,
|
|
"grad_norm": 0.21087345121646806,
|
|
"learning_rate": 1.7067070593208398e-06,
|
|
"loss": 1.6478,
|
|
"mean_token_accuracy": 0.6568685829639435,
|
|
"num_tokens": 904472649.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.3393467268006014,
|
|
"grad_norm": 0.3373385824709043,
|
|
"learning_rate": 1.7031844441313233e-06,
|
|
"loss": 1.6423,
|
|
"mean_token_accuracy": 0.6583301424980164,
|
|
"num_tokens": 905409801.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.3407134071340714,
|
|
"grad_norm": 0.20982276316807638,
|
|
"learning_rate": 1.6996618289418064e-06,
|
|
"loss": 1.6379,
|
|
"mean_token_accuracy": 0.6593515396118164,
|
|
"num_tokens": 906347737.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 1.3420800874675414,
|
|
"grad_norm": 0.2292682057517228,
|
|
"learning_rate": 1.6961392137522898e-06,
|
|
"loss": 1.6563,
|
|
"mean_token_accuracy": 0.6535943150520325,
|
|
"num_tokens": 907218513.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.3434467678010114,
|
|
"grad_norm": 0.5235140581692664,
|
|
"learning_rate": 1.6926165985627733e-06,
|
|
"loss": 1.5786,
|
|
"mean_token_accuracy": 0.666674119234085,
|
|
"num_tokens": 908096974.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 1.67265625,
|
|
"epoch": 1.3448134481344813,
|
|
"grad_norm": 0.23806321805450276,
|
|
"learning_rate": 1.6890939833732564e-06,
|
|
"loss": 1.6775,
|
|
"mean_token_accuracy": 0.6489076972007751,
|
|
"num_tokens": 909044993.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 1.3461801284679513,
|
|
"grad_norm": 0.2049468033178053,
|
|
"learning_rate": 1.6855713681837396e-06,
|
|
"loss": 1.692,
|
|
"mean_token_accuracy": 0.650264424085617,
|
|
"num_tokens": 909982466.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.3475468088014213,
|
|
"grad_norm": 0.18151430467179858,
|
|
"learning_rate": 1.6820487529942231e-06,
|
|
"loss": 1.6217,
|
|
"mean_token_accuracy": 0.6606227636337281,
|
|
"num_tokens": 910893227.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.3489134891348913,
|
|
"grad_norm": 0.36589200027523566,
|
|
"learning_rate": 1.6785261378047062e-06,
|
|
"loss": 1.6112,
|
|
"mean_token_accuracy": 0.6606104433536529,
|
|
"num_tokens": 911811892.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.3502801694683613,
|
|
"grad_norm": 0.28326005998466897,
|
|
"learning_rate": 1.6750035226151898e-06,
|
|
"loss": 1.6201,
|
|
"mean_token_accuracy": 0.6615821242332458,
|
|
"num_tokens": 912692091.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 1.66328125,
|
|
"epoch": 1.3516468498018313,
|
|
"grad_norm": 0.17422008297062547,
|
|
"learning_rate": 1.671480907425673e-06,
|
|
"loss": 1.6624,
|
|
"mean_token_accuracy": 0.6545768320560456,
|
|
"num_tokens": 913632609.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 1.625,
|
|
"epoch": 1.3530135301353012,
|
|
"grad_norm": 0.2811479575571388,
|
|
"learning_rate": 1.6679582922361562e-06,
|
|
"loss": 1.6317,
|
|
"mean_token_accuracy": 0.6597061455249786,
|
|
"num_tokens": 914538308.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.3543802104687712,
|
|
"grad_norm": 0.17978211356343002,
|
|
"learning_rate": 1.6644356770466396e-06,
|
|
"loss": 1.6471,
|
|
"mean_token_accuracy": 0.6539968132972718,
|
|
"num_tokens": 915473944.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 1.64765625,
|
|
"epoch": 1.3557468908022414,
|
|
"grad_norm": 0.23554759630689748,
|
|
"learning_rate": 1.660913061857123e-06,
|
|
"loss": 1.6688,
|
|
"mean_token_accuracy": 0.6529323160648346,
|
|
"num_tokens": 916408095.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 1.58125,
|
|
"epoch": 1.3571135711357114,
|
|
"grad_norm": 0.363935605882665,
|
|
"learning_rate": 1.657390446667606e-06,
|
|
"loss": 1.569,
|
|
"mean_token_accuracy": 0.671255248785019,
|
|
"num_tokens": 917332935.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.3584802514691814,
|
|
"grad_norm": 0.2323854209631324,
|
|
"learning_rate": 1.6538678314780896e-06,
|
|
"loss": 1.6319,
|
|
"mean_token_accuracy": 0.6597575187683106,
|
|
"num_tokens": 918240615.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 1.3598469318026514,
|
|
"grad_norm": 0.202006813035657,
|
|
"learning_rate": 1.6503452162885727e-06,
|
|
"loss": 1.735,
|
|
"mean_token_accuracy": 0.6409150958061218,
|
|
"num_tokens": 919135660.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 1.3612136121361214,
|
|
"grad_norm": 0.15770756244731005,
|
|
"learning_rate": 1.646822601099056e-06,
|
|
"loss": 1.6231,
|
|
"mean_token_accuracy": 0.6597519636154174,
|
|
"num_tokens": 920057548.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.3625802924695913,
|
|
"grad_norm": 0.21432148591048078,
|
|
"learning_rate": 1.6432999859095394e-06,
|
|
"loss": 1.6343,
|
|
"mean_token_accuracy": 0.6593257129192353,
|
|
"num_tokens": 920984564.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 1.3639469728030613,
|
|
"grad_norm": 0.3524774644179013,
|
|
"learning_rate": 1.6397773707200227e-06,
|
|
"loss": 1.7198,
|
|
"mean_token_accuracy": 0.6432790398597718,
|
|
"num_tokens": 921912267.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.3653136531365313,
|
|
"grad_norm": 0.2881333219565065,
|
|
"learning_rate": 1.636254755530506e-06,
|
|
"loss": 1.6411,
|
|
"mean_token_accuracy": 0.6564955651760102,
|
|
"num_tokens": 922795541.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 1.53828125,
|
|
"epoch": 1.3666803334700013,
|
|
"grad_norm": 0.21991929061712537,
|
|
"learning_rate": 1.6327321403409894e-06,
|
|
"loss": 1.542,
|
|
"mean_token_accuracy": 0.6723182201385498,
|
|
"num_tokens": 923706457.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.3680470138034715,
|
|
"grad_norm": 0.25462343383688196,
|
|
"learning_rate": 1.6292095251514725e-06,
|
|
"loss": 1.666,
|
|
"mean_token_accuracy": 0.6544298887252807,
|
|
"num_tokens": 924619324.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 1.3694136941369415,
|
|
"grad_norm": 0.4611545017060456,
|
|
"learning_rate": 1.625686909961956e-06,
|
|
"loss": 1.7089,
|
|
"mean_token_accuracy": 0.6443469762802124,
|
|
"num_tokens": 925550838.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.3707803744704115,
|
|
"grad_norm": 0.26663187376613684,
|
|
"learning_rate": 1.6221642947724392e-06,
|
|
"loss": 1.6452,
|
|
"mean_token_accuracy": 0.657294899225235,
|
|
"num_tokens": 926431221.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 1.7109375,
|
|
"epoch": 1.3721470548038814,
|
|
"grad_norm": 0.26637270496625215,
|
|
"learning_rate": 1.6186416795829223e-06,
|
|
"loss": 1.7071,
|
|
"mean_token_accuracy": 0.6451487362384796,
|
|
"num_tokens": 927369341.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 1.72578125,
|
|
"epoch": 1.3735137351373514,
|
|
"grad_norm": 0.21909445265738292,
|
|
"learning_rate": 1.6151190643934058e-06,
|
|
"loss": 1.7383,
|
|
"mean_token_accuracy": 0.6424112319946289,
|
|
"num_tokens": 928305868.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 1.3748804154708214,
|
|
"grad_norm": 0.22022799343086835,
|
|
"learning_rate": 1.6115964492038892e-06,
|
|
"loss": 1.6213,
|
|
"mean_token_accuracy": 0.6627050995826721,
|
|
"num_tokens": 929220902.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.3762470958042914,
|
|
"grad_norm": 0.21379224694517704,
|
|
"learning_rate": 1.6080738340143723e-06,
|
|
"loss": 1.6359,
|
|
"mean_token_accuracy": 0.6564735114574433,
|
|
"num_tokens": 930162608.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.3776137761377614,
|
|
"grad_norm": 0.19185627088250243,
|
|
"learning_rate": 1.6045512188248559e-06,
|
|
"loss": 1.5784,
|
|
"mean_token_accuracy": 0.666518634557724,
|
|
"num_tokens": 931038642.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.3789804564712314,
|
|
"grad_norm": 0.2754643334137231,
|
|
"learning_rate": 1.601028603635339e-06,
|
|
"loss": 1.6251,
|
|
"mean_token_accuracy": 0.6597003996372223,
|
|
"num_tokens": 931936151.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 1.678125,
|
|
"epoch": 1.3803471368047013,
|
|
"grad_norm": 0.18177030848328268,
|
|
"learning_rate": 1.5975059884458225e-06,
|
|
"loss": 1.7028,
|
|
"mean_token_accuracy": 0.6472250163555145,
|
|
"num_tokens": 932901376.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.3817138171381713,
|
|
"grad_norm": 0.1874844260686219,
|
|
"learning_rate": 1.5939833732563056e-06,
|
|
"loss": 1.6402,
|
|
"mean_token_accuracy": 0.6579659163951874,
|
|
"num_tokens": 933829760.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 1.3830804974716413,
|
|
"grad_norm": 0.17174597439606593,
|
|
"learning_rate": 1.5904607580667888e-06,
|
|
"loss": 1.656,
|
|
"mean_token_accuracy": 0.6542600929737091,
|
|
"num_tokens": 934778601.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.3844471778051113,
|
|
"grad_norm": 0.27775097119065545,
|
|
"learning_rate": 1.5869381428772723e-06,
|
|
"loss": 1.658,
|
|
"mean_token_accuracy": 0.6541840970516205,
|
|
"num_tokens": 935692864.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 1.3858138581385813,
|
|
"grad_norm": 0.1851045481100333,
|
|
"learning_rate": 1.5834155276877554e-06,
|
|
"loss": 1.6678,
|
|
"mean_token_accuracy": 0.6530578970909119,
|
|
"num_tokens": 936596044.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 1.3871805384720512,
|
|
"grad_norm": 0.2068450215947012,
|
|
"learning_rate": 1.5798929124982388e-06,
|
|
"loss": 1.6557,
|
|
"mean_token_accuracy": 0.6524785995483399,
|
|
"num_tokens": 937480304.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 1.3885472188055215,
|
|
"grad_norm": 0.23357949542495002,
|
|
"learning_rate": 1.5763702973087221e-06,
|
|
"loss": 1.6739,
|
|
"mean_token_accuracy": 0.651078176498413,
|
|
"num_tokens": 938375460.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.3899138991389914,
|
|
"grad_norm": 0.24577395107432054,
|
|
"learning_rate": 1.5728476821192054e-06,
|
|
"loss": 1.5978,
|
|
"mean_token_accuracy": 0.665805596113205,
|
|
"num_tokens": 939288362.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 1.715625,
|
|
"epoch": 1.3912805794724614,
|
|
"grad_norm": 0.18967887008789547,
|
|
"learning_rate": 1.5693250669296886e-06,
|
|
"loss": 1.7262,
|
|
"mean_token_accuracy": 0.6456466436386108,
|
|
"num_tokens": 940240296.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 1.58984375,
|
|
"epoch": 1.3926472598059314,
|
|
"grad_norm": 0.19209135400023133,
|
|
"learning_rate": 1.5658024517401721e-06,
|
|
"loss": 1.588,
|
|
"mean_token_accuracy": 0.6652971386909485,
|
|
"num_tokens": 941160920.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 1.3940139401394014,
|
|
"grad_norm": 0.1718165190380873,
|
|
"learning_rate": 1.5622798365506552e-06,
|
|
"loss": 1.6053,
|
|
"mean_token_accuracy": 0.6638791263103485,
|
|
"num_tokens": 942103330.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.3953806204728714,
|
|
"grad_norm": 0.1881393748147358,
|
|
"learning_rate": 1.5587572213611388e-06,
|
|
"loss": 1.6299,
|
|
"mean_token_accuracy": 0.6606503605842591,
|
|
"num_tokens": 943025073.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 1.3967473008063414,
|
|
"grad_norm": 0.17551069110964787,
|
|
"learning_rate": 1.555234606171622e-06,
|
|
"loss": 1.6546,
|
|
"mean_token_accuracy": 0.6561763703823089,
|
|
"num_tokens": 943981278.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 1.69609375,
|
|
"epoch": 1.3981139811398113,
|
|
"grad_norm": 0.21317213765464949,
|
|
"learning_rate": 1.5517119909821052e-06,
|
|
"loss": 1.7098,
|
|
"mean_token_accuracy": 0.6479735970497131,
|
|
"num_tokens": 944931928.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.3994806614732813,
|
|
"grad_norm": 0.25506275387033406,
|
|
"learning_rate": 1.5481893757925886e-06,
|
|
"loss": 1.6272,
|
|
"mean_token_accuracy": 0.6586183547973633,
|
|
"num_tokens": 945881003.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 1.4008473418067515,
|
|
"grad_norm": 0.1879145422882954,
|
|
"learning_rate": 1.544666760603072e-06,
|
|
"loss": 1.6789,
|
|
"mean_token_accuracy": 0.6492001354694367,
|
|
"num_tokens": 946761027.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 1.4022140221402215,
|
|
"grad_norm": 0.18920743956979583,
|
|
"learning_rate": 1.541144145413555e-06,
|
|
"loss": 1.6987,
|
|
"mean_token_accuracy": 0.6463018774986267,
|
|
"num_tokens": 947708884.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 1.4035807024736915,
|
|
"grad_norm": 0.21190837633920165,
|
|
"learning_rate": 1.5376215302240386e-06,
|
|
"loss": 1.701,
|
|
"mean_token_accuracy": 0.6491838932037354,
|
|
"num_tokens": 948701710.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.4049473828071615,
|
|
"grad_norm": 0.19080049098537336,
|
|
"learning_rate": 1.5340989150345217e-06,
|
|
"loss": 1.6696,
|
|
"mean_token_accuracy": 0.6528431892395019,
|
|
"num_tokens": 949631378.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 1.53984375,
|
|
"epoch": 1.4063140631406315,
|
|
"grad_norm": 0.1998170777919943,
|
|
"learning_rate": 1.5305762998450048e-06,
|
|
"loss": 1.5597,
|
|
"mean_token_accuracy": 0.6714257180690766,
|
|
"num_tokens": 950542748.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 1.746875,
|
|
"epoch": 1.4076807434741014,
|
|
"grad_norm": 0.19393802263690005,
|
|
"learning_rate": 1.5270536846554884e-06,
|
|
"loss": 1.7575,
|
|
"mean_token_accuracy": 0.6388438582420349,
|
|
"num_tokens": 951462208.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 1.66171875,
|
|
"epoch": 1.4090474238075714,
|
|
"grad_norm": 0.22806651878487733,
|
|
"learning_rate": 1.5235310694659717e-06,
|
|
"loss": 1.6629,
|
|
"mean_token_accuracy": 0.6516242265701294,
|
|
"num_tokens": 952402969.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.4104141041410414,
|
|
"grad_norm": 0.2617556250613641,
|
|
"learning_rate": 1.5200084542764548e-06,
|
|
"loss": 1.5828,
|
|
"mean_token_accuracy": 0.6656874537467956,
|
|
"num_tokens": 953343813.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.4117807844745114,
|
|
"grad_norm": 0.22978956521771884,
|
|
"learning_rate": 1.5164858390869384e-06,
|
|
"loss": 1.6428,
|
|
"mean_token_accuracy": 0.654781949520111,
|
|
"num_tokens": 954265318.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 1.7171875,
|
|
"epoch": 1.4131474648079814,
|
|
"grad_norm": 0.2757902760337423,
|
|
"learning_rate": 1.5129632238974215e-06,
|
|
"loss": 1.728,
|
|
"mean_token_accuracy": 0.6451602697372436,
|
|
"num_tokens": 955158767.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.4145141451414514,
|
|
"grad_norm": 0.26698996667129193,
|
|
"learning_rate": 1.509440608707905e-06,
|
|
"loss": 1.641,
|
|
"mean_token_accuracy": 0.6567740797996521,
|
|
"num_tokens": 956111839.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 1.4158808254749213,
|
|
"grad_norm": 0.1892088552219068,
|
|
"learning_rate": 1.5059179935183882e-06,
|
|
"loss": 1.68,
|
|
"mean_token_accuracy": 0.6491348385810852,
|
|
"num_tokens": 957026429.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.4172475058083913,
|
|
"grad_norm": 0.27860231936971974,
|
|
"learning_rate": 1.5023953783288713e-06,
|
|
"loss": 1.6096,
|
|
"mean_token_accuracy": 0.6630604863166809,
|
|
"num_tokens": 957964243.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.4186141861418613,
|
|
"grad_norm": 0.1997929878081924,
|
|
"learning_rate": 1.4988727631393549e-06,
|
|
"loss": 1.6471,
|
|
"mean_token_accuracy": 0.6545568108558655,
|
|
"num_tokens": 958875820.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 1.4199808664753313,
|
|
"grad_norm": 0.2402474738972467,
|
|
"learning_rate": 1.495350147949838e-06,
|
|
"loss": 1.713,
|
|
"mean_token_accuracy": 0.6485125482082367,
|
|
"num_tokens": 959836599.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.4213475468088015,
|
|
"grad_norm": 0.17338332962933337,
|
|
"learning_rate": 1.4918275327603213e-06,
|
|
"loss": 1.6057,
|
|
"mean_token_accuracy": 0.6649423837661743,
|
|
"num_tokens": 960724745.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 1.4227142271422715,
|
|
"grad_norm": 0.2057776426474246,
|
|
"learning_rate": 1.4883049175708047e-06,
|
|
"loss": 1.6228,
|
|
"mean_token_accuracy": 0.6612998425960541,
|
|
"num_tokens": 961597345.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 1.6625,
|
|
"epoch": 1.4240809074757415,
|
|
"grad_norm": 0.1822709883341973,
|
|
"learning_rate": 1.484782302381288e-06,
|
|
"loss": 1.6403,
|
|
"mean_token_accuracy": 0.6554524958133697,
|
|
"num_tokens": 962519557.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 1.65703125,
|
|
"epoch": 1.4254475878092114,
|
|
"grad_norm": 0.18687251314021155,
|
|
"learning_rate": 1.4812596871917711e-06,
|
|
"loss": 1.6642,
|
|
"mean_token_accuracy": 0.6556366860866547,
|
|
"num_tokens": 963423510.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.4268142681426814,
|
|
"grad_norm": 0.18536405879761356,
|
|
"learning_rate": 1.4777370720022547e-06,
|
|
"loss": 1.6076,
|
|
"mean_token_accuracy": 0.6621335327625275,
|
|
"num_tokens": 964342013.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 1.4281809484761514,
|
|
"grad_norm": 0.27169291691110353,
|
|
"learning_rate": 1.4742144568127378e-06,
|
|
"loss": 1.6368,
|
|
"mean_token_accuracy": 0.656197440624237,
|
|
"num_tokens": 965292201.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.4295476288096214,
|
|
"grad_norm": 0.21278729085361006,
|
|
"learning_rate": 1.4706918416232213e-06,
|
|
"loss": 1.5803,
|
|
"mean_token_accuracy": 0.6686306655406952,
|
|
"num_tokens": 966213561.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 1.70078125,
|
|
"epoch": 1.4309143091430914,
|
|
"grad_norm": 0.20135353623771918,
|
|
"learning_rate": 1.4671692264337045e-06,
|
|
"loss": 1.704,
|
|
"mean_token_accuracy": 0.6495081305503845,
|
|
"num_tokens": 967164285.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.4322809894765614,
|
|
"grad_norm": 0.22608654556626173,
|
|
"learning_rate": 1.4636466112441878e-06,
|
|
"loss": 1.6669,
|
|
"mean_token_accuracy": 0.6507824003696442,
|
|
"num_tokens": 968113317.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.4336476698100316,
|
|
"grad_norm": 0.26968369491065936,
|
|
"learning_rate": 1.4601239960546711e-06,
|
|
"loss": 1.6329,
|
|
"mean_token_accuracy": 0.6579782485961914,
|
|
"num_tokens": 969035603.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 1.7140625,
|
|
"epoch": 1.4350143501435015,
|
|
"grad_norm": 0.19763221420726182,
|
|
"learning_rate": 1.4566013808651545e-06,
|
|
"loss": 1.7114,
|
|
"mean_token_accuracy": 0.6466321468353271,
|
|
"num_tokens": 970003896.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.4363810304769715,
|
|
"grad_norm": 0.2089251936345775,
|
|
"learning_rate": 1.4530787656756376e-06,
|
|
"loss": 1.6114,
|
|
"mean_token_accuracy": 0.66223925948143,
|
|
"num_tokens": 970964132.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 1.68828125,
|
|
"epoch": 1.4377477108104415,
|
|
"grad_norm": 0.1822905048786221,
|
|
"learning_rate": 1.4495561504861211e-06,
|
|
"loss": 1.6831,
|
|
"mean_token_accuracy": 0.6502669751644135,
|
|
"num_tokens": 971937851.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 1.4391143911439115,
|
|
"grad_norm": 0.18713401409522315,
|
|
"learning_rate": 1.4460335352966043e-06,
|
|
"loss": 1.6171,
|
|
"mean_token_accuracy": 0.6619602978229523,
|
|
"num_tokens": 972855658.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 1.73046875,
|
|
"epoch": 1.4404810714773815,
|
|
"grad_norm": 0.17697620015573573,
|
|
"learning_rate": 1.4425109201070876e-06,
|
|
"loss": 1.732,
|
|
"mean_token_accuracy": 0.6412524342536926,
|
|
"num_tokens": 973862404.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 1.4418477518108515,
|
|
"grad_norm": 0.19258012492497414,
|
|
"learning_rate": 1.438988304917571e-06,
|
|
"loss": 1.636,
|
|
"mean_token_accuracy": 0.6559379696846008,
|
|
"num_tokens": 974806955.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 1.64375,
|
|
"epoch": 1.4432144321443214,
|
|
"grad_norm": 0.461063856812337,
|
|
"learning_rate": 1.4354656897280543e-06,
|
|
"loss": 1.6519,
|
|
"mean_token_accuracy": 0.6545793294906617,
|
|
"num_tokens": 975738964.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.4445811124777914,
|
|
"grad_norm": 0.22024830442925852,
|
|
"learning_rate": 1.4319430745385376e-06,
|
|
"loss": 1.6458,
|
|
"mean_token_accuracy": 0.6551294326782227,
|
|
"num_tokens": 976654660.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
"epoch": 1.4459477928112614,
"grad_norm": 0.2131753576822763,
"learning_rate": 1.428420459349021e-06,
"loss": 1.6374,
"mean_token_accuracy": 0.6586807370185852,
"num_tokens": 977575312.0,
"step": 10580
},
{
"entropy": 1.55234375,
"epoch": 1.4473144731447314,
"grad_norm": 0.2503650511784387,
"learning_rate": 1.424897844159504e-06,
"loss": 1.5738,
"mean_token_accuracy": 0.6683693587779999,
"num_tokens": 978500444.0,
"step": 10590
},
{
"entropy": 1.61015625,
"epoch": 1.4486811534782014,
"grad_norm": 0.32172443077549645,
"learning_rate": 1.4213752289699876e-06,
"loss": 1.6077,
"mean_token_accuracy": 0.6617006123065948,
"num_tokens": 979405955.0,
"step": 10600
},
{
"entropy": 1.54609375,
"epoch": 1.4500478338116713,
"grad_norm": 0.20375632315764228,
"learning_rate": 1.4178526137804707e-06,
"loss": 1.5499,
"mean_token_accuracy": 0.6718333065509796,
"num_tokens": 980315786.0,
"step": 10610
},
{
"entropy": 1.584375,
"epoch": 1.4514145141451413,
"grad_norm": 0.24234385482368884,
"learning_rate": 1.4143299985909538e-06,
"loss": 1.5741,
"mean_token_accuracy": 0.6686679303646088,
"num_tokens": 981218106.0,
"step": 10620
},
{
"entropy": 1.58984375,
"epoch": 1.4527811944786113,
"grad_norm": 0.19299438644896574,
"learning_rate": 1.4108073834014374e-06,
"loss": 1.5786,
"mean_token_accuracy": 0.6666796207427979,
"num_tokens": 982174484.0,
"step": 10630
},
{
"entropy": 1.5765625,
"epoch": 1.4541478748120815,
"grad_norm": 0.1953568818224216,
"learning_rate": 1.4072847682119205e-06,
"loss": 1.5874,
"mean_token_accuracy": 0.6652349412441254,
"num_tokens": 983122346.0,
"step": 10640
},
{
"entropy": 1.675,
"epoch": 1.4555145551455515,
"grad_norm": 0.2115941555618993,
"learning_rate": 1.4037621530224039e-06,
"loss": 1.6772,
"mean_token_accuracy": 0.653554481267929,
"num_tokens": 984102946.0,
"step": 10650
},
{
"entropy": 1.61953125,
"epoch": 1.4568812354790215,
"grad_norm": 0.2904153229134878,
"learning_rate": 1.4002395378328872e-06,
"loss": 1.6273,
"mean_token_accuracy": 0.6576791048049927,
"num_tokens": 985061607.0,
"step": 10660
},
{
"entropy": 1.5828125,
"epoch": 1.4582479158124915,
"grad_norm": 0.6115158379450263,
"learning_rate": 1.3967169226433705e-06,
"loss": 1.5801,
"mean_token_accuracy": 0.6677203059196473,
"num_tokens": 985985807.0,
"step": 10670
},
{
"entropy": 1.65234375,
"epoch": 1.4596145961459615,
"grad_norm": 0.21092583776296614,
"learning_rate": 1.393194307453854e-06,
"loss": 1.6677,
"mean_token_accuracy": 0.6534775018692016,
"num_tokens": 986926889.0,
"step": 10680
},
{
"entropy": 1.66875,
"epoch": 1.4609812764794314,
"grad_norm": 0.1735715133848591,
"learning_rate": 1.3896716922643372e-06,
"loss": 1.6646,
"mean_token_accuracy": 0.6522797644138336,
"num_tokens": 987829666.0,
"step": 10690
},
{
"entropy": 1.60546875,
"epoch": 1.4623479568129014,
"grad_norm": 0.21607721741096012,
"learning_rate": 1.3861490770748203e-06,
"loss": 1.6072,
"mean_token_accuracy": 0.6635283648967742,
"num_tokens": 988775294.0,
"step": 10700
},
{
"entropy": 1.6359375,
"epoch": 1.4637146371463714,
"grad_norm": 0.24270114219344818,
"learning_rate": 1.3826264618853039e-06,
"loss": 1.6401,
"mean_token_accuracy": 0.6541480183601379,
"num_tokens": 989650475.0,
"step": 10710
},
{
"entropy": 1.67890625,
"epoch": 1.4650813174798414,
"grad_norm": 0.2117536587152192,
"learning_rate": 1.379103846695787e-06,
"loss": 1.6931,
"mean_token_accuracy": 0.6495812892913818,
"num_tokens": 990641968.0,
"step": 10720
},
{
"entropy": 1.63671875,
"epoch": 1.4664479978133116,
"grad_norm": 0.22368305595066587,
"learning_rate": 1.3755812315062703e-06,
"loss": 1.6573,
"mean_token_accuracy": 0.6552225172519683,
"num_tokens": 991559497.0,
"step": 10730
},
{
"entropy": 1.60859375,
"epoch": 1.4678146781467816,
"grad_norm": 0.2512616671332665,
"learning_rate": 1.3720586163167537e-06,
"loss": 1.6365,
"mean_token_accuracy": 0.6562442302703857,
"num_tokens": 992481920.0,
"step": 10740
},
{
"entropy": 1.61015625,
"epoch": 1.4691813584802516,
"grad_norm": 0.22039101267724218,
"learning_rate": 1.368536001127237e-06,
"loss": 1.623,
"mean_token_accuracy": 0.6602308630943299,
"num_tokens": 993380400.0,
"step": 10750
},
{
"entropy": 1.6234375,
"epoch": 1.4705480388137215,
"grad_norm": 0.18031254775417893,
"learning_rate": 1.3650133859377201e-06,
"loss": 1.6159,
"mean_token_accuracy": 0.6580818295478821,
"num_tokens": 994290951.0,
"step": 10760
},
{
"entropy": 1.65,
"epoch": 1.4719147191471915,
"grad_norm": 0.17486411223367967,
"learning_rate": 1.3614907707482037e-06,
"loss": 1.6434,
"mean_token_accuracy": 0.6574078917503356,
"num_tokens": 995193039.0,
"step": 10770
},
{
"entropy": 1.74296875,
"epoch": 1.4732813994806615,
"grad_norm": 0.17436811123731502,
"learning_rate": 1.3579681555586868e-06,
"loss": 1.736,
"mean_token_accuracy": 0.6441664278507233,
"num_tokens": 996146367.0,
"step": 10780
},
{
"entropy": 1.65234375,
"epoch": 1.4746480798141315,
"grad_norm": 0.1895885768918216,
"learning_rate": 1.3544455403691703e-06,
"loss": 1.6595,
"mean_token_accuracy": 0.6540235638618469,
"num_tokens": 997097425.0,
"step": 10790
},
{
"entropy": 1.5984375,
"epoch": 1.4760147601476015,
"grad_norm": 0.2132512422239411,
"learning_rate": 1.3509229251796535e-06,
"loss": 1.5816,
"mean_token_accuracy": 0.6659575343132019,
"num_tokens": 997982427.0,
"step": 10800
},
{
"entropy": 1.6671875,
"epoch": 1.4773814404810715,
"grad_norm": 0.21850200359765629,
"learning_rate": 1.3474003099901368e-06,
"loss": 1.6734,
"mean_token_accuracy": 0.6531930923461914,
"num_tokens": 998877987.0,
"step": 10810
},
{
"entropy": 1.62890625,
"epoch": 1.4787481208145414,
"grad_norm": 0.2918332607885295,
"learning_rate": 1.3438776948006201e-06,
"loss": 1.6237,
"mean_token_accuracy": 0.6619933485984802,
"num_tokens": 999784022.0,
"step": 10820
},
{
"entropy": 1.59609375,
"epoch": 1.4801148011480114,
"grad_norm": 0.19126076442969342,
"learning_rate": 1.3403550796111035e-06,
"loss": 1.616,
"mean_token_accuracy": 0.662722247838974,
"num_tokens": 1000711640.0,
"step": 10830
},
{
"entropy": 1.65390625,
"epoch": 1.4814814814814814,
"grad_norm": 0.22578209573170197,
"learning_rate": 1.3368324644215866e-06,
"loss": 1.6473,
"mean_token_accuracy": 0.656601470708847,
"num_tokens": 1001634583.0,
"step": 10840
},
{
"entropy": 1.5734375,
"epoch": 1.4828481618149514,
"grad_norm": 0.1938170089779734,
"learning_rate": 1.3333098492320701e-06,
"loss": 1.5761,
"mean_token_accuracy": 0.665473359823227,
"num_tokens": 1002569965.0,
"step": 10850
},
{
"entropy": 1.68984375,
"epoch": 1.4842148421484214,
"grad_norm": 0.3192456211625434,
"learning_rate": 1.3297872340425533e-06,
"loss": 1.7019,
"mean_token_accuracy": 0.6468794345855713,
"num_tokens": 1003462518.0,
"step": 10860
},
{
"entropy": 1.6953125,
"epoch": 1.4855815224818913,
"grad_norm": 0.20866536395392718,
"learning_rate": 1.3262646188530364e-06,
"loss": 1.7316,
"mean_token_accuracy": 0.6448964893817901,
"num_tokens": 1004430622.0,
"step": 10870
},
{
"entropy": 1.58671875,
"epoch": 1.4869482028153616,
"grad_norm": 0.2183997337673435,
"learning_rate": 1.32274200366352e-06,
"loss": 1.6035,
"mean_token_accuracy": 0.6642241716384888,
"num_tokens": 1005359033.0,
"step": 10880
},
{
"entropy": 1.58203125,
"epoch": 1.4883148831488315,
"grad_norm": 0.2345879334240788,
"learning_rate": 1.319219388474003e-06,
"loss": 1.5841,
"mean_token_accuracy": 0.667106819152832,
"num_tokens": 1006238835.0,
"step": 10890
},
{
"entropy": 1.6625,
"epoch": 1.4896815634823015,
"grad_norm": 0.26663546617720657,
"learning_rate": 1.3156967732844866e-06,
"loss": 1.6756,
"mean_token_accuracy": 0.6506954491138458,
"num_tokens": 1007172706.0,
"step": 10900
},
{
"entropy": 1.58125,
|
|
"epoch": 1.4910482438157715,
|
|
"grad_norm": 0.18443027689184044,
|
|
"learning_rate": 1.31217415809497e-06,
|
|
"loss": 1.5809,
|
|
"mean_token_accuracy": 0.6661109924316406,
|
|
"num_tokens": 1008120807.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 1.70703125,
|
|
"epoch": 1.4924149241492415,
|
|
"grad_norm": 0.21838808658019074,
|
|
"learning_rate": 1.308651542905453e-06,
|
|
"loss": 1.7409,
|
|
"mean_token_accuracy": 0.6395205855369568,
|
|
"num_tokens": 1009095174.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 1.59921875,
|
|
"epoch": 1.4937816044827115,
|
|
"grad_norm": 0.1988001608940964,
|
|
"learning_rate": 1.3051289277159366e-06,
|
|
"loss": 1.6034,
|
|
"mean_token_accuracy": 0.6614027738571167,
|
|
"num_tokens": 1009999058.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 1.56796875,
|
|
"epoch": 1.4951482848161814,
|
|
"grad_norm": 0.38567744596043463,
|
|
"learning_rate": 1.3016063125264197e-06,
|
|
"loss": 1.5767,
|
|
"mean_token_accuracy": 0.6678137481212616,
|
|
"num_tokens": 1010904055.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 1.69375,
|
|
"epoch": 1.4965149651496514,
|
|
"grad_norm": 0.2159574022538828,
|
|
"learning_rate": 1.2980836973369029e-06,
|
|
"loss": 1.6853,
|
|
"mean_token_accuracy": 0.6476708173751831,
|
|
"num_tokens": 1011794287.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.4978816454831214,
|
|
"grad_norm": 0.1769779394348502,
|
|
"learning_rate": 1.2945610821473864e-06,
|
|
"loss": 1.5946,
|
|
"mean_token_accuracy": 0.6654948055744171,
|
|
"num_tokens": 1012684163.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.4992483258165916,
|
|
"grad_norm": 0.19343426665186908,
|
|
"learning_rate": 1.2910384669578695e-06,
|
|
"loss": 1.6047,
|
|
"mean_token_accuracy": 0.6652138352394104,
|
|
"num_tokens": 1013615463.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.5006150061500616,
|
|
"grad_norm": 0.22060414749540516,
|
|
"learning_rate": 1.2875158517683529e-06,
|
|
"loss": 1.6083,
|
|
"mean_token_accuracy": 0.6633041381835938,
|
|
"num_tokens": 1014513226.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 1.67890625,
|
|
"epoch": 1.5019816864835316,
|
|
"grad_norm": 0.2807161886990323,
|
|
"learning_rate": 1.2839932365788362e-06,
|
|
"loss": 1.6877,
|
|
"mean_token_accuracy": 0.6494120359420776,
|
|
"num_tokens": 1015433714.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 1.61953125,
|
|
"epoch": 1.5033483668170016,
|
|
"grad_norm": 0.22323180669890208,
|
|
"learning_rate": 1.2804706213893195e-06,
|
|
"loss": 1.6245,
|
|
"mean_token_accuracy": 0.6593492269515991,
|
|
"num_tokens": 1016371913.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 1.5047150471504716,
|
|
"grad_norm": 0.27168455457837093,
|
|
"learning_rate": 1.2769480061998029e-06,
|
|
"loss": 1.6788,
|
|
"mean_token_accuracy": 0.6525617897510528,
|
|
"num_tokens": 1017305199.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.5060817274839415,
|
|
"grad_norm": 0.1982095540855432,
|
|
"learning_rate": 1.2734253910102862e-06,
|
|
"loss": 1.6181,
|
|
"mean_token_accuracy": 0.6620503306388855,
|
|
"num_tokens": 1018269575.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.5074484078174115,
|
|
"grad_norm": 0.17036612190382497,
|
|
"learning_rate": 1.2699027758207693e-06,
|
|
"loss": 1.6393,
|
|
"mean_token_accuracy": 0.6568759560585022,
|
|
"num_tokens": 1019189121.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 1.728125,
|
|
"epoch": 1.5088150881508815,
|
|
"grad_norm": 0.24767140630738158,
|
|
"learning_rate": 1.2663801606312529e-06,
|
|
"loss": 1.7459,
|
|
"mean_token_accuracy": 0.6360925793647766,
|
|
"num_tokens": 1020170250.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 1.56484375,
|
|
"epoch": 1.5101817684843515,
|
|
"grad_norm": 0.20201614729272216,
|
|
"learning_rate": 1.262857545441736e-06,
|
|
"loss": 1.5627,
|
|
"mean_token_accuracy": 0.6700040578842164,
|
|
"num_tokens": 1021058238.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.5115484488178215,
|
|
"grad_norm": 0.2382752473057111,
|
|
"learning_rate": 1.2593349302522193e-06,
|
|
"loss": 1.6459,
|
|
"mean_token_accuracy": 0.6547189176082611,
|
|
"num_tokens": 1021971104.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 1.68046875,
|
|
"epoch": 1.5129151291512914,
|
|
"grad_norm": 0.20666621894192233,
|
|
"learning_rate": 1.2558123150627027e-06,
|
|
"loss": 1.7075,
|
|
"mean_token_accuracy": 0.6460173189640045,
|
|
"num_tokens": 1022905419.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 1.5142818094847614,
|
|
"grad_norm": 0.18668676557887823,
|
|
"learning_rate": 1.252289699873186e-06,
|
|
"loss": 1.7093,
|
|
"mean_token_accuracy": 0.6434547245502472,
|
|
"num_tokens": 1023833216.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 1.6578125,
|
|
"epoch": 1.5156484898182314,
|
|
"grad_norm": 0.2208478159677512,
|
|
"learning_rate": 1.2487670846836693e-06,
|
|
"loss": 1.6629,
|
|
"mean_token_accuracy": 0.65260608792305,
|
|
"num_tokens": 1024771488.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 1.76328125,
|
|
"epoch": 1.5170151701517014,
|
|
"grad_norm": 0.27055383666719574,
|
|
"learning_rate": 1.2452444694941527e-06,
|
|
"loss": 1.771,
|
|
"mean_token_accuracy": 0.6361504316329956,
|
|
"num_tokens": 1025658591.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.5183818504851714,
|
|
"grad_norm": 0.18629524423261323,
|
|
"learning_rate": 1.2417218543046358e-06,
|
|
"loss": 1.6191,
|
|
"mean_token_accuracy": 0.6605481445789337,
|
|
"num_tokens": 1026589579.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.5197485308186414,
|
|
"grad_norm": 0.19457279550025783,
|
|
"learning_rate": 1.2381992391151191e-06,
|
|
"loss": 1.6035,
|
|
"mean_token_accuracy": 0.6649640560150146,
|
|
"num_tokens": 1027502306.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 1.5211152111521116,
|
|
"grad_norm": 0.258166956803373,
|
|
"learning_rate": 1.2346766239256025e-06,
|
|
"loss": 1.6395,
|
|
"mean_token_accuracy": 0.654163408279419,
|
|
"num_tokens": 1028450634.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.5224818914855816,
|
|
"grad_norm": 0.2023072514698911,
|
|
"learning_rate": 1.2311540087360858e-06,
|
|
"loss": 1.6368,
|
|
"mean_token_accuracy": 0.6573539793491363,
|
|
"num_tokens": 1029413490.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.5238485718190515,
|
|
"grad_norm": 0.19048133445418125,
|
|
"learning_rate": 1.2276313935465691e-06,
|
|
"loss": 1.5739,
|
|
"mean_token_accuracy": 0.6686464369297027,
|
|
"num_tokens": 1030335180.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 1.5252152521525215,
|
|
"grad_norm": 0.32055625924300046,
|
|
"learning_rate": 1.2241087783570525e-06,
|
|
"loss": 1.6134,
|
|
"mean_token_accuracy": 0.6605431318283081,
|
|
"num_tokens": 1031263059.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.5265819324859915,
|
|
"grad_norm": 0.19415298936058525,
|
|
"learning_rate": 1.2205861631675358e-06,
|
|
"loss": 1.6352,
|
|
"mean_token_accuracy": 0.6560705542564392,
|
|
"num_tokens": 1032195653.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 1.69765625,
|
|
"epoch": 1.5279486128194615,
|
|
"grad_norm": 0.23273716506654066,
|
|
"learning_rate": 1.217063547978019e-06,
|
|
"loss": 1.7008,
|
|
"mean_token_accuracy": 0.6454036295413971,
|
|
"num_tokens": 1033105876.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 1.62421875,
|
|
"epoch": 1.5293152931529317,
|
|
"grad_norm": 0.2504789107680329,
|
|
"learning_rate": 1.2135409327885023e-06,
|
|
"loss": 1.6283,
|
|
"mean_token_accuracy": 0.6597443640232086,
|
|
"num_tokens": 1034023825.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.5306819734864017,
|
|
"grad_norm": 0.19049141707416442,
|
|
"learning_rate": 1.2100183175989856e-06,
|
|
"loss": 1.6084,
|
|
"mean_token_accuracy": 0.6619674503803253,
|
|
"num_tokens": 1034924050.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 1.55,
|
|
"epoch": 1.5320486538198717,
|
|
"grad_norm": 0.1785046219755286,
|
|
"learning_rate": 1.206495702409469e-06,
|
|
"loss": 1.5652,
|
|
"mean_token_accuracy": 0.6702347755432129,
|
|
"num_tokens": 1035830053.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 1.55859375,
|
|
"epoch": 1.5334153341533416,
|
|
"grad_norm": 0.2546747925354842,
|
|
"learning_rate": 1.202973087219952e-06,
|
|
"loss": 1.5632,
|
|
"mean_token_accuracy": 0.6663195133209229,
|
|
"num_tokens": 1036752025.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.5347820144868116,
|
|
"grad_norm": 0.3178938997929608,
|
|
"learning_rate": 1.1994504720304354e-06,
|
|
"loss": 1.6439,
|
|
"mean_token_accuracy": 0.6558451890945435,
|
|
"num_tokens": 1037699942.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.5361486948202816,
|
|
"grad_norm": 0.21415413798261446,
|
|
"learning_rate": 1.1959278568409187e-06,
|
|
"loss": 1.6312,
|
|
"mean_token_accuracy": 0.656808465719223,
|
|
"num_tokens": 1038642434.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 1.57734375,
|
|
"epoch": 1.5375153751537516,
|
|
"grad_norm": 0.19882487769500146,
|
|
"learning_rate": 1.192405241651402e-06,
|
|
"loss": 1.5737,
|
|
"mean_token_accuracy": 0.6648223340511322,
|
|
"num_tokens": 1039564462.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.5388820554872216,
|
|
"grad_norm": 0.25591825797891854,
|
|
"learning_rate": 1.1888826264618854e-06,
|
|
"loss": 1.6095,
|
|
"mean_token_accuracy": 0.6621817052364349,
|
|
"num_tokens": 1040481300.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.5402487358206916,
|
|
"grad_norm": 0.3245133419487537,
|
|
"learning_rate": 1.1853600112723687e-06,
|
|
"loss": 1.6412,
|
|
"mean_token_accuracy": 0.6569789290428162,
|
|
"num_tokens": 1041421627.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 1.5416154161541615,
|
|
"grad_norm": 0.17161900139796854,
|
|
"learning_rate": 1.181837396082852e-06,
|
|
"loss": 1.6082,
|
|
"mean_token_accuracy": 0.6636178195476532,
|
|
"num_tokens": 1042361786.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.5429820964876315,
|
|
"grad_norm": 0.23868600658249997,
|
|
"learning_rate": 1.1783147808933352e-06,
|
|
"loss": 1.6074,
|
|
"mean_token_accuracy": 0.66429323554039,
|
|
"num_tokens": 1043298571.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 1.73359375,
|
|
"epoch": 1.5443487768211015,
|
|
"grad_norm": 0.29134300944631997,
|
|
"learning_rate": 1.1747921657038185e-06,
|
|
"loss": 1.7483,
|
|
"mean_token_accuracy": 0.638844782114029,
|
|
"num_tokens": 1044220087.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 1.5671875,
|
|
"epoch": 1.5457154571545715,
|
|
"grad_norm": 0.3274225968886461,
|
|
"learning_rate": 1.1712695505143019e-06,
|
|
"loss": 1.5631,
|
|
"mean_token_accuracy": 0.6672748386859894,
|
|
"num_tokens": 1045114501.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.5470821374880415,
|
|
"grad_norm": 0.21734445859540624,
|
|
"learning_rate": 1.1677469353247852e-06,
|
|
"loss": 1.6446,
|
|
"mean_token_accuracy": 0.6554768741130829,
|
|
"num_tokens": 1045981143.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.5484488178215114,
|
|
"grad_norm": 0.2531375344157984,
|
|
"learning_rate": 1.1642243201352685e-06,
|
|
"loss": 1.6573,
|
|
"mean_token_accuracy": 0.6525388777256012,
|
|
"num_tokens": 1046936764.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 1.60625,
|
|
"epoch": 1.5498154981549814,
|
|
"grad_norm": 0.23287262334980888,
|
|
"learning_rate": 1.1607017049457519e-06,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6596551418304444,
|
|
"num_tokens": 1047875776.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.5511821784884514,
|
|
"grad_norm": 0.20815970074805107,
|
|
"learning_rate": 1.1571790897562352e-06,
|
|
"loss": 1.6328,
|
|
"mean_token_accuracy": 0.6578056752681732,
|
|
"num_tokens": 1048776451.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.5525488588219214,
|
|
"grad_norm": 0.201163498894995,
|
|
"learning_rate": 1.1536564745667183e-06,
|
|
"loss": 1.5793,
|
|
"mean_token_accuracy": 0.6678462862968445,
|
|
"num_tokens": 1049687530.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.5539155391553916,
|
|
"grad_norm": 0.22794196512435722,
|
|
"learning_rate": 1.1501338593772017e-06,
|
|
"loss": 1.6149,
|
|
"mean_token_accuracy": 0.6597362279891967,
|
|
"num_tokens": 1050593880.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 1.5552822194888616,
|
|
"grad_norm": 0.24811404195709424,
|
|
"learning_rate": 1.146611244187685e-06,
|
|
"loss": 1.6779,
|
|
"mean_token_accuracy": 0.6547135174274444,
|
|
"num_tokens": 1051545923.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 1.6640625,
|
|
"epoch": 1.5566488998223316,
|
|
"grad_norm": 0.19467945247868093,
|
|
"learning_rate": 1.1430886289981683e-06,
|
|
"loss": 1.6718,
|
|
"mean_token_accuracy": 0.6491936147212982,
|
|
"num_tokens": 1052463775.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 1.54765625,
|
|
"epoch": 1.5580155801558015,
|
|
"grad_norm": 0.33037197105209676,
|
|
"learning_rate": 1.1395660138086517e-06,
|
|
"loss": 1.5301,
|
|
"mean_token_accuracy": 0.6737705588340759,
|
|
"num_tokens": 1053333690.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.5593822604892715,
|
|
"grad_norm": 0.18818106651550057,
|
|
"learning_rate": 1.136043398619135e-06,
|
|
"loss": 1.5982,
|
|
"mean_token_accuracy": 0.6661373138427734,
|
|
"num_tokens": 1054252019.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 1.7015625,
|
|
"epoch": 1.5607489408227415,
|
|
"grad_norm": 0.16530015016612873,
|
|
"learning_rate": 1.1325207834296184e-06,
|
|
"loss": 1.7,
|
|
"mean_token_accuracy": 0.6473625004291534,
|
|
"num_tokens": 1055187638.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.5621156211562117,
|
|
"grad_norm": 0.2424824753735191,
|
|
"learning_rate": 1.1289981682401017e-06,
|
|
"loss": 1.6571,
|
|
"mean_token_accuracy": 0.6556045174598694,
|
|
"num_tokens": 1056074177.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.5634823014896817,
|
|
"grad_norm": 0.21001097252714537,
|
|
"learning_rate": 1.1254755530505848e-06,
|
|
"loss": 1.599,
|
|
"mean_token_accuracy": 0.6633764922618866,
|
|
"num_tokens": 1056998685.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 1.5648489818231517,
|
|
"grad_norm": 0.25956620170657213,
|
|
"learning_rate": 1.1219529378610681e-06,
|
|
"loss": 1.6792,
|
|
"mean_token_accuracy": 0.6492401182651519,
|
|
"num_tokens": 1057917935.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.5662156621566217,
|
|
"grad_norm": 0.2033269170462861,
|
|
"learning_rate": 1.1184303226715515e-06,
|
|
"loss": 1.6386,
|
|
"mean_token_accuracy": 0.6576379179954529,
|
|
"num_tokens": 1058859092.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.5675823424900917,
|
|
"grad_norm": 0.2037034516396853,
|
|
"learning_rate": 1.1149077074820346e-06,
|
|
"loss": 1.6658,
|
|
"mean_token_accuracy": 0.6514975130558014,
|
|
"num_tokens": 1059795446.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 1.67109375,
|
|
"epoch": 1.5689490228235616,
|
|
"grad_norm": 0.2443241194339983,
|
|
"learning_rate": 1.111385092292518e-06,
|
|
"loss": 1.6643,
|
|
"mean_token_accuracy": 0.6532392382621766,
|
|
"num_tokens": 1060705820.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 1.58125,
|
|
"epoch": 1.5703157031570316,
|
|
"grad_norm": 0.2126398022593399,
|
|
"learning_rate": 1.1078624771030013e-06,
|
|
"loss": 1.5978,
|
|
"mean_token_accuracy": 0.662891560792923,
|
|
"num_tokens": 1061629657.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.5716823834905016,
|
|
"grad_norm": 0.211041813369812,
|
|
"learning_rate": 1.1043398619134846e-06,
|
|
"loss": 1.6883,
|
|
"mean_token_accuracy": 0.6481752932071686,
|
|
"num_tokens": 1062588826.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 1.5730490638239716,
|
|
"grad_norm": 0.2582392383818017,
|
|
"learning_rate": 1.100817246723968e-06,
|
|
"loss": 1.6546,
|
|
"mean_token_accuracy": 0.6543661475181579,
|
|
"num_tokens": 1063495059.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.5744157441574416,
|
|
"grad_norm": 0.20924200877912766,
|
|
"learning_rate": 1.0972946315344513e-06,
|
|
"loss": 1.6741,
|
|
"mean_token_accuracy": 0.6530385196208954,
|
|
"num_tokens": 1064451323.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 1.5875,
|
|
"epoch": 1.5757824244909115,
|
|
"grad_norm": 0.20043919412193115,
|
|
"learning_rate": 1.0937720163449346e-06,
|
|
"loss": 1.6053,
|
|
"mean_token_accuracy": 0.664458590745926,
|
|
"num_tokens": 1065364727.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.5771491048243815,
|
|
"grad_norm": 0.4751501475959469,
|
|
"learning_rate": 1.0902494011554177e-06,
|
|
"loss": 1.6441,
|
|
"mean_token_accuracy": 0.6523413836956025,
|
|
"num_tokens": 1066297689.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 1.62265625,
|
|
"epoch": 1.5785157851578515,
|
|
"grad_norm": 0.19228596857424923,
|
|
"learning_rate": 1.086726785965901e-06,
|
|
"loss": 1.6294,
|
|
"mean_token_accuracy": 0.6600920140743256,
|
|
"num_tokens": 1067268095.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 1.64453125,
|
|
"epoch": 1.5798824654913215,
|
|
"grad_norm": 0.2207202268347491,
|
|
"learning_rate": 1.0832041707763844e-06,
|
|
"loss": 1.6451,
|
|
"mean_token_accuracy": 0.6567894220352173,
|
|
"num_tokens": 1068201708.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.5812491458247915,
|
|
"grad_norm": 0.2080597584268558,
|
|
"learning_rate": 1.0796815555868678e-06,
|
|
"loss": 1.6546,
|
|
"mean_token_accuracy": 0.655974018573761,
|
|
"num_tokens": 1069192024.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 1.69296875,
|
|
"epoch": 1.5826158261582615,
|
|
"grad_norm": 0.18984173316221195,
|
|
"learning_rate": 1.076158940397351e-06,
|
|
"loss": 1.7104,
|
|
"mean_token_accuracy": 0.6459042072296143,
|
|
"num_tokens": 1070119260.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.5839825064917314,
|
|
"grad_norm": 0.24026157566913356,
|
|
"learning_rate": 1.0726363252078344e-06,
|
|
"loss": 1.5876,
|
|
"mean_token_accuracy": 0.6662364661693573,
|
|
"num_tokens": 1071017885.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 1.5853491868252014,
|
|
"grad_norm": 0.31022540712438146,
|
|
"learning_rate": 1.0691137100183178e-06,
|
|
"loss": 1.5877,
|
|
"mean_token_accuracy": 0.6682347774505615,
|
|
"num_tokens": 1071931187.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.5867158671586716,
|
|
"grad_norm": 0.21960506423867315,
|
|
"learning_rate": 1.065591094828801e-06,
|
|
"loss": 1.631,
|
|
"mean_token_accuracy": 0.6559723854064942,
|
|
"num_tokens": 1072901155.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.5880825474921416,
|
|
"grad_norm": 0.21559244640023045,
|
|
"learning_rate": 1.0620684796392842e-06,
|
|
"loss": 1.6574,
|
|
"mean_token_accuracy": 0.6534350872039795,
|
|
"num_tokens": 1073768648.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 1.54609375,
|
|
"epoch": 1.5894492278256116,
|
|
"grad_norm": 0.21247433037927194,
|
|
"learning_rate": 1.0585458644497676e-06,
|
|
"loss": 1.5403,
|
|
"mean_token_accuracy": 0.6717761337757111,
|
|
"num_tokens": 1074652219.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.5908159081590816,
|
|
"grad_norm": 0.23015301601060667,
|
|
"learning_rate": 1.0550232492602509e-06,
|
|
"loss": 1.6664,
|
|
"mean_token_accuracy": 0.6516958594322204,
|
|
"num_tokens": 1075557890.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.5921825884925516,
|
|
"grad_norm": 0.17719497416302143,
|
|
"learning_rate": 1.0515006340707342e-06,
|
|
"loss": 1.6385,
|
|
"mean_token_accuracy": 0.6585043966770172,
|
|
"num_tokens": 1076466593.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 1.53671875,
|
|
"epoch": 1.5935492688260215,
|
|
"grad_norm": 0.20036427877460256,
|
|
"learning_rate": 1.0479780188812176e-06,
|
|
"loss": 1.5414,
|
|
"mean_token_accuracy": 0.6738483726978302,
|
|
"num_tokens": 1077346158.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.5949159491594918,
|
|
"grad_norm": 0.4447658237427549,
|
|
"learning_rate": 1.0444554036917009e-06,
|
|
"loss": 1.683,
|
|
"mean_token_accuracy": 0.6529668092727661,
|
|
"num_tokens": 1078280883.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 1.70390625,
|
|
"epoch": 1.5962826294929617,
|
|
"grad_norm": 0.19304392022836914,
|
|
"learning_rate": 1.0409327885021842e-06,
|
|
"loss": 1.7204,
|
|
"mean_token_accuracy": 0.6424146056175232,
|
|
"num_tokens": 1079222170.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.5976493098264317,
|
|
"grad_norm": 0.3687531387741987,
|
|
"learning_rate": 1.0374101733126674e-06,
|
|
"loss": 1.6005,
|
|
"mean_token_accuracy": 0.663310444355011,
|
|
"num_tokens": 1080146536.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 1.55078125,
|
|
"epoch": 1.5990159901599017,
|
|
"grad_norm": 0.16142672363294366,
|
|
"learning_rate": 1.0338875581231507e-06,
|
|
"loss": 1.5499,
|
|
"mean_token_accuracy": 0.6708192050457,
|
|
"num_tokens": 1081015044.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 1.6003826704933717,
|
|
"grad_norm": 0.25057421145099223,
|
|
"learning_rate": 1.030364942933634e-06,
|
|
"loss": 1.7204,
|
|
"mean_token_accuracy": 0.6440162003040314,
|
|
"num_tokens": 1081909054.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.6017493508268417,
|
|
"grad_norm": 0.2796861508342579,
|
|
"learning_rate": 1.0268423277441174e-06,
|
|
"loss": 1.6467,
|
|
"mean_token_accuracy": 0.6561101913452149,
|
|
"num_tokens": 1082806118.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.6031160311603116,
|
|
"grad_norm": 0.2326314024344254,
|
|
"learning_rate": 1.0233197125546005e-06,
|
|
"loss": 1.5768,
|
|
"mean_token_accuracy": 0.6679746866226196,
|
|
"num_tokens": 1083679331.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.6044827114937816,
|
|
"grad_norm": 0.23319305231431753,
|
|
"learning_rate": 1.0197970973650838e-06,
|
|
"loss": 1.6469,
|
|
"mean_token_accuracy": 0.6561409533023834,
|
|
"num_tokens": 1084609117.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 1.6058493918272516,
|
|
"grad_norm": 0.19915078803003136,
|
|
"learning_rate": 1.0162744821755674e-06,
|
|
"loss": 1.6272,
|
|
"mean_token_accuracy": 0.6592645168304443,
|
|
"num_tokens": 1085541577.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.6072160721607216,
|
|
"grad_norm": 0.29877217484902907,
|
|
"learning_rate": 1.0127518669860505e-06,
|
|
"loss": 1.6287,
|
|
"mean_token_accuracy": 0.660723865032196,
|
|
"num_tokens": 1086467001.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 1.6085827524941916,
|
|
"grad_norm": 0.37358018967109285,
|
|
"learning_rate": 1.0092292517965338e-06,
|
|
"loss": 1.686,
|
|
"mean_token_accuracy": 0.6512668669223786,
|
|
"num_tokens": 1087414988.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.6099494328276616,
|
|
"grad_norm": 0.2675263992404585,
|
|
"learning_rate": 1.0057066366070172e-06,
|
|
"loss": 1.6392,
|
|
"mean_token_accuracy": 0.6558610618114471,
|
|
"num_tokens": 1088324734.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 1.5453125,
|
|
"epoch": 1.6113161131611315,
|
|
"grad_norm": 0.16623193243885714,
|
|
"learning_rate": 1.0021840214175005e-06,
|
|
"loss": 1.548,
|
|
"mean_token_accuracy": 0.6715109169483184,
|
|
"num_tokens": 1089247461.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 1.61484375,
|
|
"epoch": 1.6126827934946015,
|
|
"grad_norm": 0.20348976890517292,
|
|
"learning_rate": 9.986614062279836e-07,
|
|
"loss": 1.6278,
|
|
"mean_token_accuracy": 0.6604547619819641,
|
|
"num_tokens": 1090190488.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.6140494738280715,
|
|
"grad_norm": 0.20740590905094727,
|
|
"learning_rate": 9.95138791038467e-07,
|
|
"loss": 1.5746,
|
|
"mean_token_accuracy": 0.667230200767517,
|
|
"num_tokens": 1091130191.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.6154161541615415,
|
|
"grad_norm": 0.2231032499345613,
|
|
"learning_rate": 9.916161758489503e-07,
|
|
"loss": 1.5906,
|
|
"mean_token_accuracy": 0.6664340496063232,
|
|
"num_tokens": 1092059734.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 1.6167828344950115,
|
|
"grad_norm": 0.23468485115524731,
|
|
"learning_rate": 9.880935606594336e-07,
|
|
"loss": 1.7157,
|
|
"mean_token_accuracy": 0.6463204383850097,
|
|
"num_tokens": 1093036813.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.6181495148284815,
|
|
"grad_norm": 0.24904295946896918,
|
|
"learning_rate": 9.84570945469917e-07,
|
|
"loss": 1.6174,
|
|
"mean_token_accuracy": 0.6627859830856323,
|
|
"num_tokens": 1093962873.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 1.6195161951619517,
|
|
"grad_norm": 0.24364764161007618,
|
|
"learning_rate": 9.810483302804003e-07,
|
|
"loss": 1.669,
|
|
"mean_token_accuracy": 0.6522782802581787,
|
|
"num_tokens": 1094875156.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.6208828754954216,
|
|
"grad_norm": 0.230811188639464,
|
|
"learning_rate": 9.775257150908836e-07,
|
|
"loss": 1.6255,
|
|
"mean_token_accuracy": 0.6593356192111969,
|
|
"num_tokens": 1095768762.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 1.6222495558288916,
|
|
"grad_norm": 0.21609563971692441,
|
|
"learning_rate": 9.740030999013668e-07,
|
|
"loss": 1.6403,
|
|
"mean_token_accuracy": 0.6578260600566864,
|
|
"num_tokens": 1096670077.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 1.55546875,
|
|
"epoch": 1.6236162361623616,
|
|
"grad_norm": 0.2216529642842737,
|
|
"learning_rate": 9.7048048471185e-07,
|
|
"loss": 1.5579,
|
|
"mean_token_accuracy": 0.6663577675819397,
|
|
"num_tokens": 1097565965.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 1.628125,
|
|
"epoch": 1.6249829164958316,
|
|
"grad_norm": 0.2010459094886648,
|
|
"learning_rate": 9.669578695223334e-07,
|
|
"loss": 1.6449,
|
|
"mean_token_accuracy": 0.656721293926239,
|
|
"num_tokens": 1098508985.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.6263495968293016,
|
|
"grad_norm": 0.18401529158747068,
|
|
"learning_rate": 9.634352543328168e-07,
|
|
"loss": 1.6161,
|
|
"mean_token_accuracy": 0.6595653057098388,
|
|
"num_tokens": 1099404008.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.6277162771627718,
|
|
"grad_norm": 0.17887164998139055,
|
|
"learning_rate": 9.599126391433e-07,
|
|
"loss": 1.6337,
|
|
"mean_token_accuracy": 0.6575253903865814,
|
|
"num_tokens": 1100318485.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 1.653125,
|
|
"epoch": 1.6290829574962418,
|
|
"grad_norm": 0.16061264091696623,
|
|
"learning_rate": 9.563900239537834e-07,
|
|
"loss": 1.6584,
|
|
"mean_token_accuracy": 0.6547501623630524,
|
|
"num_tokens": 1101230476.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 1.590625,
|
|
"epoch": 1.6304496378297118,
|
|
"grad_norm": 0.3796682807962226,
|
|
"learning_rate": 9.528674087642667e-07,
|
|
"loss": 1.6166,
|
|
"mean_token_accuracy": 0.661040997505188,
|
|
"num_tokens": 1102182598.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.6318163181631817,
|
|
"grad_norm": 0.16547069313940319,
|
|
"learning_rate": 9.4934479357475e-07,
|
|
"loss": 1.5682,
|
|
"mean_token_accuracy": 0.6707364082336426,
|
|
"num_tokens": 1103119337.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 1.7234375,
|
|
"epoch": 1.6331829984966517,
|
|
"grad_norm": 0.27579563917562605,
|
|
"learning_rate": 9.458221783852332e-07,
|
|
"loss": 1.7167,
|
|
"mean_token_accuracy": 0.6445057988166809,
|
|
"num_tokens": 1104080100.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.6345496788301217,
|
|
"grad_norm": 0.29687315115287516,
|
|
"learning_rate": 9.422995631957166e-07,
|
|
"loss": 1.593,
|
|
"mean_token_accuracy": 0.664111214876175,
|
|
"num_tokens": 1104967269.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 1.6765625,
|
|
"epoch": 1.6359163591635917,
|
|
"grad_norm": 0.2099980005837067,
|
|
"learning_rate": 9.387769480061999e-07,
|
|
"loss": 1.6828,
|
|
"mean_token_accuracy": 0.6508075416088104,
|
|
"num_tokens": 1105882901.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 1.6453125,
|
|
"epoch": 1.6372830394970617,
|
|
"grad_norm": 0.1589163818965185,
|
|
"learning_rate": 9.352543328166831e-07,
|
|
"loss": 1.6451,
|
|
"mean_token_accuracy": 0.6549432814121247,
|
|
"num_tokens": 1106845559.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 1.6386497198305316,
|
|
"grad_norm": 0.3263613682891053,
|
|
"learning_rate": 9.317317176271665e-07,
|
|
"loss": 1.6236,
|
|
"mean_token_accuracy": 0.6602192223072052,
|
|
"num_tokens": 1107727672.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 1.6875,
|
|
"epoch": 1.6400164001640016,
|
|
"grad_norm": 0.2731052346722345,
|
|
"learning_rate": 9.282091024376498e-07,
|
|
"loss": 1.7125,
|
|
"mean_token_accuracy": 0.6430910885334015,
|
|
"num_tokens": 1108708800.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 1.73359375,
|
|
"epoch": 1.6413830804974716,
|
|
"grad_norm": 0.3795159354638953,
|
|
"learning_rate": 9.246864872481331e-07,
|
|
"loss": 1.7456,
|
|
"mean_token_accuracy": 0.6382662236690522,
|
|
"num_tokens": 1109590091.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.6427497608309416,
|
|
"grad_norm": 0.23551595157102648,
|
|
"learning_rate": 9.211638720586164e-07,
|
|
"loss": 1.6313,
|
|
"mean_token_accuracy": 0.6568625867366791,
|
|
"num_tokens": 1110474536.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.6441164411644116,
|
|
"grad_norm": 0.3518241036561605,
|
|
"learning_rate": 9.176412568690997e-07,
|
|
"loss": 1.5894,
|
|
"mean_token_accuracy": 0.6632352709770203,
|
|
"num_tokens": 1111396491.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 1.6454831214978816,
|
|
"grad_norm": 0.2529757693606955,
|
|
"learning_rate": 9.14118641679583e-07,
|
|
"loss": 1.652,
|
|
"mean_token_accuracy": 0.652618819475174,
|
|
"num_tokens": 1112331818.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.6468498018313515,
|
|
"grad_norm": 0.19215683533491867,
|
|
"learning_rate": 9.105960264900663e-07,
|
|
"loss": 1.6072,
|
|
"mean_token_accuracy": 0.6627243459224701,
|
|
"num_tokens": 1113208831.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.6482164821648215,
|
|
"grad_norm": 0.1933579440839738,
|
|
"learning_rate": 9.070734113005496e-07,
|
|
"loss": 1.6348,
|
|
"mean_token_accuracy": 0.6587618052959442,
|
|
"num_tokens": 1114186468.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 1.6495831624982915,
|
|
"grad_norm": 0.22966677015568798,
|
|
"learning_rate": 9.035507961110329e-07,
|
|
"loss": 1.683,
|
|
"mean_token_accuracy": 0.648996251821518,
|
|
"num_tokens": 1115090773.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.6509498428317615,
|
|
"grad_norm": 0.2300159701612083,
|
|
"learning_rate": 9.000281809215163e-07,
|
|
"loss": 1.6279,
|
|
"mean_token_accuracy": 0.6597879588603973,
|
|
"num_tokens": 1116004500.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 1.5640625,
|
|
"epoch": 1.6523165231652317,
|
|
"grad_norm": 0.21004386907508296,
|
|
"learning_rate": 8.965055657319995e-07,
|
|
"loss": 1.5696,
|
|
"mean_token_accuracy": 0.669169670343399,
|
|
"num_tokens": 1116921134.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.6536832034987017,
|
|
"grad_norm": 0.1616565775836976,
|
|
"learning_rate": 8.929829505424828e-07,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6612919986248016,
|
|
"num_tokens": 1117846968.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 1.5828125,
|
|
"epoch": 1.6550498838321717,
|
|
"grad_norm": 0.24058023232763587,
|
|
"learning_rate": 8.894603353529662e-07,
|
|
"loss": 1.5792,
|
|
"mean_token_accuracy": 0.6650661647319793,
|
|
"num_tokens": 1118759805.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.6564165641656416,
|
|
"grad_norm": 0.2751140578782856,
|
|
"learning_rate": 8.859377201634495e-07,
|
|
"loss": 1.6256,
|
|
"mean_token_accuracy": 0.6607782959938049,
|
|
"num_tokens": 1119722391.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 1.675,
|
|
"epoch": 1.6577832444991116,
|
|
"grad_norm": 0.161924247306187,
|
|
"learning_rate": 8.824151049739326e-07,
|
|
"loss": 1.6769,
|
|
"mean_token_accuracy": 0.6468493282794953,
|
|
"num_tokens": 1120656875.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.6591499248325816,
|
|
"grad_norm": 0.1695454909313773,
|
|
"learning_rate": 8.788924897844161e-07,
|
|
"loss": 1.6039,
|
|
"mean_token_accuracy": 0.6634563267230987,
|
|
"num_tokens": 1121616071.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.6605166051660518,
|
|
"grad_norm": 0.22326196099328835,
|
|
"learning_rate": 8.753698745948994e-07,
|
|
"loss": 1.6028,
|
|
"mean_token_accuracy": 0.663267970085144,
|
|
"num_tokens": 1122563465.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 1.57890625,
|
|
"epoch": 1.6618832854995218,
|
|
"grad_norm": 0.2930560388534829,
|
|
"learning_rate": 8.718472594053825e-07,
|
|
"loss": 1.5643,
|
|
"mean_token_accuracy": 0.6680546879768372,
|
|
"num_tokens": 1123441208.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.6632499658329918,
|
|
"grad_norm": 0.26682766976765776,
|
|
"learning_rate": 8.683246442158659e-07,
|
|
"loss": 1.6152,
|
|
"mean_token_accuracy": 0.6614244103431701,
|
|
"num_tokens": 1124338025.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 1.5890625,
|
|
"epoch": 1.6646166461664618,
|
|
"grad_norm": 0.17252915344420605,
|
|
"learning_rate": 8.648020290263492e-07,
|
|
"loss": 1.5897,
|
|
"mean_token_accuracy": 0.6646176636219024,
|
|
"num_tokens": 1125221558.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 1.6296875,
|
|
"epoch": 1.6659833264999317,
|
|
"grad_norm": 0.22281770500143386,
|
|
"learning_rate": 8.612794138368325e-07,
|
|
"loss": 1.6336,
|
|
"mean_token_accuracy": 0.6570609271526336,
|
|
"num_tokens": 1126154059.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 1.6390625,
|
|
"epoch": 1.6673500068334017,
|
|
"grad_norm": 0.21310073543808186,
|
|
"learning_rate": 8.577567986473158e-07,
|
|
"loss": 1.6436,
|
|
"mean_token_accuracy": 0.6571339070796967,
|
|
"num_tokens": 1127042415.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.6687166871668717,
|
|
"grad_norm": 0.17310065138643396,
|
|
"learning_rate": 8.542341834577991e-07,
|
|
"loss": 1.5819,
|
|
"mean_token_accuracy": 0.6662663102149964,
|
|
"num_tokens": 1127950396.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 1.56484375,
|
|
"epoch": 1.6700833675003417,
|
|
"grad_norm": 0.2729755030366577,
|
|
"learning_rate": 8.507115682682824e-07,
|
|
"loss": 1.568,
|
|
"mean_token_accuracy": 0.6674483895301819,
|
|
"num_tokens": 1128858214.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 1.6953125,
|
|
"epoch": 1.6714500478338117,
|
|
"grad_norm": 0.2078959332707891,
|
|
"learning_rate": 8.471889530787658e-07,
|
|
"loss": 1.7068,
|
|
"mean_token_accuracy": 0.6488052189350129,
|
|
"num_tokens": 1129820075.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.6728167281672817,
|
|
"grad_norm": 0.2145357181077699,
|
|
"learning_rate": 8.43666337889249e-07,
|
|
"loss": 1.6381,
|
|
"mean_token_accuracy": 0.6586169958114624,
|
|
"num_tokens": 1130730590.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.6741834085007516,
|
|
"grad_norm": 0.17976105105314674,
|
|
"learning_rate": 8.401437226997323e-07,
|
|
"loss": 1.6579,
|
|
"mean_token_accuracy": 0.6576570749282837,
|
|
"num_tokens": 1131677409.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.6755500888342216,
|
|
"grad_norm": 0.18867904884213438,
|
|
"learning_rate": 8.366211075102157e-07,
|
|
"loss": 1.6224,
|
|
"mean_token_accuracy": 0.6584068477153778,
|
|
"num_tokens": 1132542860.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 1.60703125,
|
|
"epoch": 1.6769167691676916,
|
|
"grad_norm": 0.2400043126706364,
|
|
"learning_rate": 8.330984923206989e-07,
|
|
"loss": 1.6007,
|
|
"mean_token_accuracy": 0.6617262363433838,
|
|
"num_tokens": 1133492332.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 1.6782834495011616,
|
|
"grad_norm": 0.22086100230000347,
|
|
"learning_rate": 8.295758771311822e-07,
|
|
"loss": 1.6856,
|
|
"mean_token_accuracy": 0.6476235926151276,
|
|
"num_tokens": 1134355882.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.6796501298346316,
|
|
"grad_norm": 0.320238907258098,
|
|
"learning_rate": 8.260532619416656e-07,
|
|
"loss": 1.6121,
|
|
"mean_token_accuracy": 0.6612754106521607,
|
|
"num_tokens": 1135251494.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.6810168101681016,
|
|
"grad_norm": 0.28653862630786764,
|
|
"learning_rate": 8.225306467521489e-07,
|
|
"loss": 1.6235,
|
|
"mean_token_accuracy": 0.6605446219444275,
|
|
"num_tokens": 1136173623.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.6823834905015715,
|
|
"grad_norm": 0.2022623336484385,
|
|
"learning_rate": 8.190080315626321e-07,
|
|
"loss": 1.6727,
|
|
"mean_token_accuracy": 0.6517422318458557,
|
|
"num_tokens": 1137067708.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.6837501708350415,
|
|
"grad_norm": 0.304619266874025,
|
|
"learning_rate": 8.154854163731155e-07,
|
|
"loss": 1.5974,
|
|
"mean_token_accuracy": 0.6645220398902894,
|
|
"num_tokens": 1138014818.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 1.6851168511685117,
|
|
"grad_norm": 0.2311990393887018,
|
|
"learning_rate": 8.119628011835988e-07,
|
|
"loss": 1.6686,
|
|
"mean_token_accuracy": 0.6487858951091766,
|
|
"num_tokens": 1138919264.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.6864835315019817,
|
|
"grad_norm": 0.20701271386448603,
|
|
"learning_rate": 8.084401859940821e-07,
|
|
"loss": 1.616,
|
|
"mean_token_accuracy": 0.660172700881958,
|
|
"num_tokens": 1139812343.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.6878502118354517,
|
|
"grad_norm": 0.2545546211763758,
|
|
"learning_rate": 8.049175708045654e-07,
|
|
"loss": 1.662,
|
|
"mean_token_accuracy": 0.6520895183086395,
|
|
"num_tokens": 1140746045.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 1.5796875,
|
|
"epoch": 1.6892168921689217,
|
|
"grad_norm": 0.18149195838042204,
|
|
"learning_rate": 8.013949556150487e-07,
|
|
"loss": 1.5849,
|
|
"mean_token_accuracy": 0.6658330440521241,
|
|
"num_tokens": 1141621905.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.6905835725023917,
|
|
"grad_norm": 0.18775385150390936,
|
|
"learning_rate": 7.97872340425532e-07,
|
|
"loss": 1.5608,
|
|
"mean_token_accuracy": 0.6680724143981933,
|
|
"num_tokens": 1142557292.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.6919502528358616,
|
|
"grad_norm": 0.1787475370801667,
|
|
"learning_rate": 7.943497252360153e-07,
|
|
"loss": 1.6168,
|
|
"mean_token_accuracy": 0.657504427433014,
|
|
"num_tokens": 1143456871.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 1.6703125,
|
|
"epoch": 1.6933169331693319,
|
|
"grad_norm": 0.2167471231685367,
|
|
"learning_rate": 7.908271100464986e-07,
|
|
"loss": 1.6834,
|
|
"mean_token_accuracy": 0.6472065389156342,
|
|
"num_tokens": 1144407918.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 1.64375,
|
|
"epoch": 1.6946836135028018,
|
|
"grad_norm": 0.19470311035070229,
|
|
"learning_rate": 7.873044948569819e-07,
|
|
"loss": 1.6719,
|
|
"mean_token_accuracy": 0.6540567398071289,
|
|
"num_tokens": 1145334421.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.6960502938362718,
|
|
"grad_norm": 0.3013663845864268,
|
|
"learning_rate": 7.837818796674653e-07,
|
|
"loss": 1.6176,
|
|
"mean_token_accuracy": 0.6611685395240784,
|
|
"num_tokens": 1146270573.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 1.6974169741697418,
|
|
"grad_norm": 0.17614683905829748,
|
|
"learning_rate": 7.802592644779484e-07,
|
|
"loss": 1.6373,
|
|
"mean_token_accuracy": 0.6548915684223175,
|
|
"num_tokens": 1147229398.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 1.6987836545032118,
|
|
"grad_norm": 0.2766886400947055,
|
|
"learning_rate": 7.767366492884317e-07,
|
|
"loss": 1.6415,
|
|
"mean_token_accuracy": 0.656659996509552,
|
|
"num_tokens": 1148138082.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.7001503348366818,
|
|
"grad_norm": 0.20496764727565314,
|
|
"learning_rate": 7.732140340989152e-07,
|
|
"loss": 1.625,
|
|
"mean_token_accuracy": 0.6581466615200042,
|
|
"num_tokens": 1149099762.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.7015170151701517,
|
|
"grad_norm": 0.22217784147755804,
|
|
"learning_rate": 7.696914189093985e-07,
|
|
"loss": 1.6622,
|
|
"mean_token_accuracy": 0.6539967060089111,
|
|
"num_tokens": 1150012404.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.7028836955036217,
|
|
"grad_norm": 0.207974429243522,
|
|
"learning_rate": 7.661688037198816e-07,
|
|
"loss": 1.589,
|
|
"mean_token_accuracy": 0.6636882662773133,
|
|
"num_tokens": 1150948734.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 1.56328125,
|
|
"epoch": 1.7042503758370917,
|
|
"grad_norm": 0.17000715850488235,
|
|
"learning_rate": 7.62646188530365e-07,
|
|
"loss": 1.5767,
|
|
"mean_token_accuracy": 0.6666019856929779,
|
|
"num_tokens": 1151863623.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 1.54453125,
|
|
"epoch": 1.7056170561705617,
|
|
"grad_norm": 0.2840383694121971,
|
|
"learning_rate": 7.591235733408483e-07,
|
|
"loss": 1.5397,
|
|
"mean_token_accuracy": 0.6721981704235077,
|
|
"num_tokens": 1152801253.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.7069837365040317,
|
|
"grad_norm": 0.291335107645373,
|
|
"learning_rate": 7.556009581513315e-07,
|
|
"loss": 1.5793,
|
|
"mean_token_accuracy": 0.6660524308681488,
|
|
"num_tokens": 1153710193.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 1.5296875,
|
|
"epoch": 1.7083504168375017,
|
|
"grad_norm": 0.2361909971081801,
|
|
"learning_rate": 7.520783429618149e-07,
|
|
"loss": 1.5606,
|
|
"mean_token_accuracy": 0.6696725964546204,
|
|
"num_tokens": 1154664809.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.7097170971709716,
|
|
"grad_norm": 0.3033594938207728,
|
|
"learning_rate": 7.485557277722982e-07,
|
|
"loss": 1.599,
|
|
"mean_token_accuracy": 0.6643129169940949,
|
|
"num_tokens": 1155599985.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.7110837775044416,
|
|
"grad_norm": 0.23057811173312903,
|
|
"learning_rate": 7.450331125827815e-07,
|
|
"loss": 1.6425,
|
|
"mean_token_accuracy": 0.6555361330509186,
|
|
"num_tokens": 1156535091.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.7124504578379116,
|
|
"grad_norm": 0.23302811602384974,
|
|
"learning_rate": 7.415104973932648e-07,
|
|
"loss": 1.6493,
|
|
"mean_token_accuracy": 0.6542197227478027,
|
|
"num_tokens": 1157473982.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 1.5484375,
|
|
"epoch": 1.7138171381713816,
|
|
"grad_norm": 0.26458456036037664,
|
|
"learning_rate": 7.379878822037481e-07,
|
|
"loss": 1.5447,
|
|
"mean_token_accuracy": 0.6702338039875031,
|
|
"num_tokens": 1158338388.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 1.65625,
|
|
"epoch": 1.7151838185048516,
|
|
"grad_norm": 0.1817308435887707,
|
|
"learning_rate": 7.344652670142314e-07,
|
|
"loss": 1.656,
|
|
"mean_token_accuracy": 0.6530230879783631,
|
|
"num_tokens": 1159271747.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 1.7165504988383216,
|
|
"grad_norm": 0.18325262784888238,
|
|
"learning_rate": 7.309426518247147e-07,
|
|
"loss": 1.6726,
|
|
"mean_token_accuracy": 0.6517063856124878,
|
|
"num_tokens": 1160216977.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.7179171791717918,
|
|
"grad_norm": 0.18760822698892804,
|
|
"learning_rate": 7.27420036635198e-07,
|
|
"loss": 1.6308,
|
|
"mean_token_accuracy": 0.659820294380188,
|
|
"num_tokens": 1161117334.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 1.7192838595052617,
|
|
"grad_norm": 0.21956587259688828,
|
|
"learning_rate": 7.238974214456813e-07,
|
|
"loss": 1.7066,
|
|
"mean_token_accuracy": 0.644955849647522,
|
|
"num_tokens": 1162064199.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.7206505398387317,
|
|
"grad_norm": 0.24428223830344128,
|
|
"learning_rate": 7.203748062561647e-07,
|
|
"loss": 1.6361,
|
|
"mean_token_accuracy": 0.65900918841362,
|
|
"num_tokens": 1163059526.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.7220172201722017,
|
|
"grad_norm": 0.19102409983658533,
|
|
"learning_rate": 7.168521910666479e-07,
|
|
"loss": 1.6313,
|
|
"mean_token_accuracy": 0.6593971610069275,
|
|
"num_tokens": 1163980207.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.7233839005056717,
|
|
"grad_norm": 0.22719221593289712,
|
|
"learning_rate": 7.133295758771312e-07,
|
|
"loss": 1.6443,
|
|
"mean_token_accuracy": 0.6573506951332092,
|
|
"num_tokens": 1164906430.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.7247505808391417,
|
|
"grad_norm": 0.20844116998071238,
|
|
"learning_rate": 7.098069606876146e-07,
|
|
"loss": 1.6239,
|
|
"mean_token_accuracy": 0.6592705905437469,
|
|
"num_tokens": 1165812583.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 1.56015625,
|
|
"epoch": 1.7261172611726119,
|
|
"grad_norm": 0.34483219460931697,
|
|
"learning_rate": 7.062843454980979e-07,
|
|
"loss": 1.5614,
|
|
"mean_token_accuracy": 0.6703908145427704,
|
|
"num_tokens": 1166709838.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 1.5609375,
|
|
"epoch": 1.7274839415060819,
|
|
"grad_norm": 0.16101334056256084,
|
|
"learning_rate": 7.027617303085811e-07,
|
|
"loss": 1.5634,
|
|
"mean_token_accuracy": 0.6680092632770538,
|
|
"num_tokens": 1167654316.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 1.58671875,
|
|
"epoch": 1.7288506218395518,
|
|
"grad_norm": 0.24653556700589618,
|
|
"learning_rate": 6.992391151190645e-07,
|
|
"loss": 1.6049,
|
|
"mean_token_accuracy": 0.6628724336624146,
|
|
"num_tokens": 1168593682.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.7302173021730218,
|
|
"grad_norm": 0.22856501191403453,
|
|
"learning_rate": 6.957164999295478e-07,
|
|
"loss": 1.6512,
|
|
"mean_token_accuracy": 0.6524114906787872,
|
|
"num_tokens": 1169473298.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.7315839825064918,
|
|
"grad_norm": 0.2193513675513743,
|
|
"learning_rate": 6.921938847400309e-07,
|
|
"loss": 1.607,
|
|
"mean_token_accuracy": 0.6614319682121277,
|
|
"num_tokens": 1170351324.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.7329506628399618,
|
|
"grad_norm": 0.29323548309159714,
|
|
"learning_rate": 6.886712695505144e-07,
|
|
"loss": 1.6268,
|
|
"mean_token_accuracy": 0.6595452666282654,
|
|
"num_tokens": 1171300735.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 1.69140625,
|
|
"epoch": 1.7343173431734318,
|
|
"grad_norm": 0.23340076584624803,
|
|
"learning_rate": 6.851486543609977e-07,
|
|
"loss": 1.7268,
|
|
"mean_token_accuracy": 0.6428976953029633,
|
|
"num_tokens": 1172234392.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.7356840235069018,
|
|
"grad_norm": 0.2157740369727349,
|
|
"learning_rate": 6.816260391714811e-07,
|
|
"loss": 1.6506,
|
|
"mean_token_accuracy": 0.6543066084384919,
|
|
"num_tokens": 1173141479.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.7370507038403717,
|
|
"grad_norm": 0.21865619448248225,
|
|
"learning_rate": 6.781034239819642e-07,
|
|
"loss": 1.6411,
|
|
"mean_token_accuracy": 0.655413419008255,
|
|
"num_tokens": 1174070097.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.7384173841738417,
|
|
"grad_norm": 0.21836147931675173,
|
|
"learning_rate": 6.745808087924475e-07,
|
|
"loss": 1.611,
|
|
"mean_token_accuracy": 0.6605900764465332,
|
|
"num_tokens": 1174982671.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 1.64765625,
|
|
"epoch": 1.7397840645073117,
|
|
"grad_norm": 0.2735651976546412,
|
|
"learning_rate": 6.710581936029308e-07,
|
|
"loss": 1.6611,
|
|
"mean_token_accuracy": 0.6559265732765198,
|
|
"num_tokens": 1175885608.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.7411507448407817,
|
|
"grad_norm": 0.1916852968645492,
|
|
"learning_rate": 6.675355784134143e-07,
|
|
"loss": 1.6318,
|
|
"mean_token_accuracy": 0.65760617852211,
|
|
"num_tokens": 1176839929.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.7425174251742517,
|
|
"grad_norm": 0.21522827494496405,
|
|
"learning_rate": 6.640129632238974e-07,
|
|
"loss": 1.5922,
|
|
"mean_token_accuracy": 0.6678592920303345,
|
|
"num_tokens": 1177767338.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.7438841055077217,
|
|
"grad_norm": 0.2208746223336588,
|
|
"learning_rate": 6.604903480343807e-07,
|
|
"loss": 1.6747,
|
|
"mean_token_accuracy": 0.6486340761184692,
|
|
"num_tokens": 1178711413.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.7452507858411916,
|
|
"grad_norm": 0.19047851865447443,
|
|
"learning_rate": 6.569677328448641e-07,
|
|
"loss": 1.6042,
|
|
"mean_token_accuracy": 0.6661227285861969,
|
|
"num_tokens": 1179611865.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.7466174661746616,
|
|
"grad_norm": 0.31936881471566453,
|
|
"learning_rate": 6.534451176553473e-07,
|
|
"loss": 1.5484,
|
|
"mean_token_accuracy": 0.6743765234947204,
|
|
"num_tokens": 1180541124.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 1.60390625,
|
|
"epoch": 1.7479841465081316,
|
|
"grad_norm": 0.23126428932882184,
|
|
"learning_rate": 6.499225024658306e-07,
|
|
"loss": 1.6173,
|
|
"mean_token_accuracy": 0.6597241997718811,
|
|
"num_tokens": 1181487470.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.7493508268416016,
|
|
"grad_norm": 0.27291825229511507,
|
|
"learning_rate": 6.46399887276314e-07,
|
|
"loss": 1.5579,
|
|
"mean_token_accuracy": 0.6690025806427002,
|
|
"num_tokens": 1182365317.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 1.70859375,
|
|
"epoch": 1.7507175071750718,
|
|
"grad_norm": 0.19510000829730978,
|
|
"learning_rate": 6.428772720867973e-07,
|
|
"loss": 1.7059,
|
|
"mean_token_accuracy": 0.6456232070922852,
|
|
"num_tokens": 1183274501.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 1.61015625,
|
|
"epoch": 1.7520841875085418,
|
|
"grad_norm": 0.21215880296098832,
|
|
"learning_rate": 6.393546568972805e-07,
|
|
"loss": 1.6127,
|
|
"mean_token_accuracy": 0.6601386368274689,
|
|
"num_tokens": 1184164825.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.7534508678420118,
|
|
"grad_norm": 0.1788587750566349,
|
|
"learning_rate": 6.358320417077639e-07,
|
|
"loss": 1.6318,
|
|
"mean_token_accuracy": 0.658016049861908,
|
|
"num_tokens": 1185080309.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 1.59453125,
|
|
"epoch": 1.7548175481754817,
|
|
"grad_norm": 0.18610242306201188,
|
|
"learning_rate": 6.323094265182472e-07,
|
|
"loss": 1.6174,
|
|
"mean_token_accuracy": 0.6596297144889831,
|
|
"num_tokens": 1185957931.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 1.58359375,
|
|
"epoch": 1.7561842285089517,
|
|
"grad_norm": 0.20582437018159422,
|
|
"learning_rate": 6.287868113287306e-07,
|
|
"loss": 1.5793,
|
|
"mean_token_accuracy": 0.6673372864723206,
|
|
"num_tokens": 1186910353.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 1.61171875,
|
|
"epoch": 1.7575509088424217,
|
|
"grad_norm": 0.1733203411429703,
|
|
"learning_rate": 6.252641961392138e-07,
|
|
"loss": 1.6263,
|
|
"mean_token_accuracy": 0.6582129240036011,
|
|
"num_tokens": 1187856893.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 1.546875,
|
|
"epoch": 1.758917589175892,
|
|
"grad_norm": 0.19043487115504254,
|
|
"learning_rate": 6.217415809496971e-07,
|
|
"loss": 1.5688,
|
|
"mean_token_accuracy": 0.6680092990398407,
|
|
"num_tokens": 1188780891.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.760284269509362,
|
|
"grad_norm": 0.19093851663409486,
|
|
"learning_rate": 6.182189657601804e-07,
|
|
"loss": 1.6437,
|
|
"mean_token_accuracy": 0.6565111398696899,
|
|
"num_tokens": 1189731860.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.7616509498428319,
|
|
"grad_norm": 0.2552468704689,
|
|
"learning_rate": 6.146963505706637e-07,
|
|
"loss": 1.626,
|
|
"mean_token_accuracy": 0.6596051752567291,
|
|
"num_tokens": 1190654852.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.7630176301763019,
|
|
"grad_norm": 0.15555414347196267,
|
|
"learning_rate": 6.11173735381147e-07,
|
|
"loss": 1.6049,
|
|
"mean_token_accuracy": 0.6636174142360687,
|
|
"num_tokens": 1191548478.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.7643843105097718,
|
|
"grad_norm": 0.18733893913242516,
|
|
"learning_rate": 6.076511201916304e-07,
|
|
"loss": 1.6685,
|
|
"mean_token_accuracy": 0.6541016161441803,
|
|
"num_tokens": 1192458123.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 1.63125,
|
|
"epoch": 1.7657509908432418,
|
|
"grad_norm": 0.18857487801033915,
|
|
"learning_rate": 6.041285050021136e-07,
|
|
"loss": 1.6321,
|
|
"mean_token_accuracy": 0.6559030890464783,
|
|
"num_tokens": 1193315293.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 1.671875,
|
|
"epoch": 1.7671176711767118,
|
|
"grad_norm": 0.28889685312925045,
|
|
"learning_rate": 6.006058898125969e-07,
|
|
"loss": 1.6642,
|
|
"mean_token_accuracy": 0.6533693671226501,
|
|
"num_tokens": 1194279520.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 1.6671875,
|
|
"epoch": 1.7684843515101818,
|
|
"grad_norm": 0.2156686160586647,
|
|
"learning_rate": 5.970832746230803e-07,
|
|
"loss": 1.6923,
|
|
"mean_token_accuracy": 0.6489702939987183,
|
|
"num_tokens": 1195278572.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 1.61875,
|
|
"epoch": 1.7698510318436518,
|
|
"grad_norm": 0.193835738135388,
|
|
"learning_rate": 5.935606594335636e-07,
|
|
"loss": 1.6365,
|
|
"mean_token_accuracy": 0.6580451905727387,
|
|
"num_tokens": 1196254571.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 1.64296875,
|
|
"epoch": 1.7712177121771218,
|
|
"grad_norm": 0.29236434808262424,
|
|
"learning_rate": 5.900380442440468e-07,
|
|
"loss": 1.6595,
|
|
"mean_token_accuracy": 0.6550698220729828,
|
|
"num_tokens": 1197174737.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 1.71484375,
|
|
"epoch": 1.7725843925105917,
|
|
"grad_norm": 0.3491117948713931,
|
|
"learning_rate": 5.8651542905453e-07,
|
|
"loss": 1.7217,
|
|
"mean_token_accuracy": 0.6441613733768463,
|
|
"num_tokens": 1198113907.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 1.59140625,
|
|
"epoch": 1.7739510728440617,
|
|
"grad_norm": 0.23481947856091992,
|
|
"learning_rate": 5.829928138650135e-07,
|
|
"loss": 1.6084,
|
|
"mean_token_accuracy": 0.6631329178810119,
|
|
"num_tokens": 1199026617.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 1.61328125,
|
|
"epoch": 1.7753177531775317,
|
|
"grad_norm": 0.1985029179663969,
|
|
"learning_rate": 5.794701986754967e-07,
|
|
"loss": 1.6187,
|
|
"mean_token_accuracy": 0.6584881901741028,
|
|
"num_tokens": 1199940022.0,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.7766844335110017,
|
|
"grad_norm": 0.19976045869927897,
|
|
"learning_rate": 5.759475834859801e-07,
|
|
"loss": 1.634,
|
|
"mean_token_accuracy": 0.6599187970161438,
|
|
"num_tokens": 1200865992.0,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"entropy": 1.68515625,
|
|
"epoch": 1.7780511138444717,
|
|
"grad_norm": 0.24244941133853193,
|
|
"learning_rate": 5.724249682964633e-07,
|
|
"loss": 1.7055,
|
|
"mean_token_accuracy": 0.6467584490776062,
|
|
"num_tokens": 1201757719.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.7794177941779417,
|
|
"grad_norm": 0.2016180100715818,
|
|
"learning_rate": 5.689023531069466e-07,
|
|
"loss": 1.6455,
|
|
"mean_token_accuracy": 0.6557511329650879,
|
|
"num_tokens": 1202729152.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.7807844745114116,
|
|
"grad_norm": 0.24018082777097607,
|
|
"learning_rate": 5.6537973791743e-07,
|
|
"loss": 1.6497,
|
|
"mean_token_accuracy": 0.6578919351100921,
|
|
"num_tokens": 1203644231.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 1.55546875,
|
|
"epoch": 1.7821511548448816,
|
|
"grad_norm": 0.1902006168396084,
|
|
"learning_rate": 5.618571227279133e-07,
|
|
"loss": 1.5624,
|
|
"mean_token_accuracy": 0.6703615307807922,
|
|
"num_tokens": 1204596313.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.7835178351783518,
|
|
"grad_norm": 0.17455987912803814,
|
|
"learning_rate": 5.583345075383965e-07,
|
|
"loss": 1.5789,
|
|
"mean_token_accuracy": 0.6662604987621308,
|
|
"num_tokens": 1205549209.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 1.57578125,
|
|
"epoch": 1.7848845155118218,
|
|
"grad_norm": 0.20994196692622852,
|
|
"learning_rate": 5.548118923488799e-07,
|
|
"loss": 1.5709,
|
|
"mean_token_accuracy": 0.6670157253742218,
|
|
"num_tokens": 1206428424.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 1.6015625,
|
|
"epoch": 1.7862511958452918,
|
|
"grad_norm": 0.17057264831184304,
|
|
"learning_rate": 5.512892771593632e-07,
|
|
"loss": 1.6066,
|
|
"mean_token_accuracy": 0.6639166533946991,
|
|
"num_tokens": 1207361675.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 1.64921875,
|
|
"epoch": 1.7876178761787618,
|
|
"grad_norm": 0.21666362385489585,
|
|
"learning_rate": 5.477666619698464e-07,
|
|
"loss": 1.6634,
|
|
"mean_token_accuracy": 0.6552066445350647,
|
|
"num_tokens": 1208303164.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.7889845565122318,
|
|
"grad_norm": 0.21527267966874286,
|
|
"learning_rate": 5.442440467803298e-07,
|
|
"loss": 1.615,
|
|
"mean_token_accuracy": 0.6590070962905884,
|
|
"num_tokens": 1209214250.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 1.63671875,
|
|
"epoch": 1.7903512368457017,
|
|
"grad_norm": 0.18666214202001283,
|
|
"learning_rate": 5.407214315908131e-07,
|
|
"loss": 1.6374,
|
|
"mean_token_accuracy": 0.6555652499198914,
|
|
"num_tokens": 1210155872.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.791717917179172,
|
|
"grad_norm": 0.22609054611931176,
|
|
"learning_rate": 5.371988164012964e-07,
|
|
"loss": 1.6991,
|
|
"mean_token_accuracy": 0.646029794216156,
|
|
"num_tokens": 1211114495.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 1.55,
|
|
"epoch": 1.793084597512642,
|
|
"grad_norm": 0.158830765326413,
|
|
"learning_rate": 5.336762012117797e-07,
|
|
"loss": 1.5674,
|
|
"mean_token_accuracy": 0.669281142950058,
|
|
"num_tokens": 1212056316.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 1.794451277846112,
|
|
"grad_norm": 0.2060574497777657,
|
|
"learning_rate": 5.30153586022263e-07,
|
|
"loss": 1.6698,
|
|
"mean_token_accuracy": 0.6507681310176849,
|
|
"num_tokens": 1212998411.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.795817958179582,
|
|
"grad_norm": 0.16915262994800342,
|
|
"learning_rate": 5.266309708327462e-07,
|
|
"loss": 1.5996,
|
|
"mean_token_accuracy": 0.6639519989490509,
|
|
"num_tokens": 1213912747.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.7971846385130519,
|
|
"grad_norm": 0.21127983860204938,
|
|
"learning_rate": 5.231083556432296e-07,
|
|
"loss": 1.5805,
|
|
"mean_token_accuracy": 0.6692678391933441,
|
|
"num_tokens": 1214819541.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.7985513188465219,
|
|
"grad_norm": 0.19445125443273334,
|
|
"learning_rate": 5.195857404537129e-07,
|
|
"loss": 1.6423,
|
|
"mean_token_accuracy": 0.6579931497573852,
|
|
"num_tokens": 1215734271.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.7999179991799918,
|
|
"grad_norm": 0.22672161216391182,
|
|
"learning_rate": 5.160631252641961e-07,
|
|
"loss": 1.6011,
|
|
"mean_token_accuracy": 0.6640138447284698,
|
|
"num_tokens": 1216631249.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.8012846795134618,
|
|
"grad_norm": 0.3419525863505421,
|
|
"learning_rate": 5.125405100746795e-07,
|
|
"loss": 1.6066,
|
|
"mean_token_accuracy": 0.6643938899040223,
|
|
"num_tokens": 1217582712.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 1.61796875,
|
|
"epoch": 1.8026513598469318,
|
|
"grad_norm": 0.22838791532600744,
|
|
"learning_rate": 5.090178948851628e-07,
|
|
"loss": 1.6394,
|
|
"mean_token_accuracy": 0.6530514776706695,
|
|
"num_tokens": 1218533496.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 1.703125,
|
|
"epoch": 1.8040180401804018,
|
|
"grad_norm": 0.20881616322251167,
|
|
"learning_rate": 5.054952796956461e-07,
|
|
"loss": 1.6959,
|
|
"mean_token_accuracy": 0.6450845420360565,
|
|
"num_tokens": 1219435260.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 1.559375,
|
|
"epoch": 1.8053847205138718,
|
|
"grad_norm": 0.19397556329400992,
|
|
"learning_rate": 5.019726645061294e-07,
|
|
"loss": 1.5621,
|
|
"mean_token_accuracy": 0.6712652444839478,
|
|
"num_tokens": 1220380792.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 1.8067514008473418,
|
|
"grad_norm": 0.21544873321143823,
|
|
"learning_rate": 4.984500493166127e-07,
|
|
"loss": 1.6749,
|
|
"mean_token_accuracy": 0.6542504310607911,
|
|
"num_tokens": 1221318937.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 1.54609375,
|
|
"epoch": 1.8081180811808117,
|
|
"grad_norm": 0.16527966890985038,
|
|
"learning_rate": 4.94927434127096e-07,
|
|
"loss": 1.558,
|
|
"mean_token_accuracy": 0.6707374155521393,
|
|
"num_tokens": 1222225162.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 1.58359375,
|
|
"epoch": 1.8094847615142817,
|
|
"grad_norm": 0.17771769573509144,
|
|
"learning_rate": 4.914048189375794e-07,
|
|
"loss": 1.588,
|
|
"mean_token_accuracy": 0.6638980686664582,
|
|
"num_tokens": 1223137173.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.8108514418477517,
|
|
"grad_norm": 0.18300615991257957,
|
|
"learning_rate": 4.878822037480626e-07,
|
|
"loss": 1.6047,
|
|
"mean_token_accuracy": 0.6625331103801727,
|
|
"num_tokens": 1223975508.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 1.8122181221812217,
|
|
"grad_norm": 0.18830843319891954,
|
|
"learning_rate": 4.843595885585459e-07,
|
|
"loss": 1.6221,
|
|
"mean_token_accuracy": 0.6589700698852539,
|
|
"num_tokens": 1224874246.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 1.6796875,
|
|
"epoch": 1.8135848025146917,
|
|
"grad_norm": 0.28305056076594337,
|
|
"learning_rate": 4.808369733690292e-07,
|
|
"loss": 1.7025,
|
|
"mean_token_accuracy": 0.6467609822750091,
|
|
"num_tokens": 1225823301.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 1.7078125,
|
|
"epoch": 1.8149514828481617,
|
|
"grad_norm": 0.20173001497898244,
|
|
"learning_rate": 4.773143581795125e-07,
|
|
"loss": 1.7149,
|
|
"mean_token_accuracy": 0.6444631457328797,
|
|
"num_tokens": 1226797501.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 1.553125,
|
|
"epoch": 1.8163181631816319,
|
|
"grad_norm": 0.2583213215937663,
|
|
"learning_rate": 4.7379174298999583e-07,
|
|
"loss": 1.5558,
|
|
"mean_token_accuracy": 0.6697051167488098,
|
|
"num_tokens": 1227706937.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 1.8176848435151018,
|
|
"grad_norm": 0.5024606776869004,
|
|
"learning_rate": 4.702691278004791e-07,
|
|
"loss": 1.6668,
|
|
"mean_token_accuracy": 0.6544624924659729,
|
|
"num_tokens": 1228668419.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.8190515238485718,
|
|
"grad_norm": 0.2476795685293577,
|
|
"learning_rate": 4.6674651261096245e-07,
|
|
"loss": 1.6195,
|
|
"mean_token_accuracy": 0.6590342044830322,
|
|
"num_tokens": 1229596481.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 1.565625,
|
|
"epoch": 1.8204182041820418,
|
|
"grad_norm": 0.4291036126772435,
|
|
"learning_rate": 4.632238974214457e-07,
|
|
"loss": 1.5761,
|
|
"mean_token_accuracy": 0.6672273874282837,
|
|
"num_tokens": 1230526375.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 1.59765625,
|
|
"epoch": 1.8217848845155118,
|
|
"grad_norm": 0.2736322859584959,
|
|
"learning_rate": 4.5970128223192907e-07,
|
|
"loss": 1.5969,
|
|
"mean_token_accuracy": 0.6650737881660461,
|
|
"num_tokens": 1231452631.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 1.5484375,
|
|
"epoch": 1.8231515648489818,
|
|
"grad_norm": 0.22817431486107909,
|
|
"learning_rate": 4.561786670424123e-07,
|
|
"loss": 1.5531,
|
|
"mean_token_accuracy": 0.6680522501468659,
|
|
"num_tokens": 1232356642.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.824518245182452,
|
|
"grad_norm": 0.2535978362468864,
|
|
"learning_rate": 4.5265605185289563e-07,
|
|
"loss": 1.6752,
|
|
"mean_token_accuracy": 0.6533879101276397,
|
|
"num_tokens": 1233307650.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 1.584375,
|
|
"epoch": 1.825884925515922,
|
|
"grad_norm": 0.21546969453216241,
|
|
"learning_rate": 4.491334366633789e-07,
|
|
"loss": 1.5767,
|
|
"mean_token_accuracy": 0.6675866305828094,
|
|
"num_tokens": 1234250908.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 1.58515625,
|
|
"epoch": 1.827251605849392,
|
|
"grad_norm": 0.26541285410690146,
|
|
"learning_rate": 4.456108214738622e-07,
|
|
"loss": 1.5839,
|
|
"mean_token_accuracy": 0.6653630673885346,
|
|
"num_tokens": 1235179374.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 1.828618286182862,
|
|
"grad_norm": 0.18197626339562778,
|
|
"learning_rate": 4.4208820628434553e-07,
|
|
"loss": 1.6419,
|
|
"mean_token_accuracy": 0.6569281637668609,
|
|
"num_tokens": 1236101663.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 1.829984966516332,
|
|
"grad_norm": 0.1710649043117985,
|
|
"learning_rate": 4.385655910948288e-07,
|
|
"loss": 1.6568,
|
|
"mean_token_accuracy": 0.6527640044689178,
|
|
"num_tokens": 1237028995.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.831351646849802,
|
|
"grad_norm": 0.3431384447177329,
|
|
"learning_rate": 4.3504297590531215e-07,
|
|
"loss": 1.6562,
|
|
"mean_token_accuracy": 0.6537882626056671,
|
|
"num_tokens": 1237966836.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 1.575,
|
|
"epoch": 1.8327183271832719,
|
|
"grad_norm": 0.19610199947681223,
|
|
"learning_rate": 4.3152036071579543e-07,
|
|
"loss": 1.6181,
|
|
"mean_token_accuracy": 0.6624767005443573,
|
|
"num_tokens": 1238915443.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 1.55859375,
|
|
"epoch": 1.8340850075167419,
|
|
"grad_norm": 0.15112442381961305,
|
|
"learning_rate": 4.2799774552627877e-07,
|
|
"loss": 1.5714,
|
|
"mean_token_accuracy": 0.6679113209247589,
|
|
"num_tokens": 1239860821.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 1.5875,
|
|
"epoch": 1.8354516878502118,
|
|
"grad_norm": 0.2320930449024873,
|
|
"learning_rate": 4.2447513033676205e-07,
|
|
"loss": 1.5995,
|
|
"mean_token_accuracy": 0.6632713437080383,
|
|
"num_tokens": 1240770943.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 1.5953125,
|
|
"epoch": 1.8368183681836818,
|
|
"grad_norm": 0.18768661862310929,
|
|
"learning_rate": 4.209525151472454e-07,
|
|
"loss": 1.5818,
|
|
"mean_token_accuracy": 0.6663322210311889,
|
|
"num_tokens": 1241672821.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.8381850485171518,
|
|
"grad_norm": 0.2330063510090964,
|
|
"learning_rate": 4.1742989995772867e-07,
|
|
"loss": 1.6437,
|
|
"mean_token_accuracy": 0.6525840759277344,
|
|
"num_tokens": 1242622369.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.8395517288506218,
|
|
"grad_norm": 0.21522501751375436,
|
|
"learning_rate": 4.13907284768212e-07,
|
|
"loss": 1.7026,
|
|
"mean_token_accuracy": 0.6474579095840454,
|
|
"num_tokens": 1243514242.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.8409184091840918,
|
|
"grad_norm": 0.32421102059483303,
|
|
"learning_rate": 4.1038466957869523e-07,
|
|
"loss": 1.6372,
|
|
"mean_token_accuracy": 0.6585109531879425,
|
|
"num_tokens": 1244413030.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 1.68125,
|
|
"epoch": 1.8422850895175618,
|
|
"grad_norm": 0.20089621941817037,
|
|
"learning_rate": 4.068620543891785e-07,
|
|
"loss": 1.6855,
|
|
"mean_token_accuracy": 0.6527083337306976,
|
|
"num_tokens": 1245408873.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 1.63359375,
|
|
"epoch": 1.8436517698510317,
|
|
"grad_norm": 0.2387955897711863,
|
|
"learning_rate": 4.0333943919966185e-07,
|
|
"loss": 1.6529,
|
|
"mean_token_accuracy": 0.6560206294059754,
|
|
"num_tokens": 1246389319.0,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"entropy": 1.55,
|
|
"epoch": 1.8450184501845017,
|
|
"grad_norm": 0.1890149124520132,
|
|
"learning_rate": 3.9981682401014513e-07,
|
|
"loss": 1.5477,
|
|
"mean_token_accuracy": 0.6721786141395569,
|
|
"num_tokens": 1247357540.0,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 1.8463851305179717,
|
|
"grad_norm": 0.2234850122527014,
|
|
"learning_rate": 3.9629420882062847e-07,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6622624874114991,
|
|
"num_tokens": 1248274925.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 1.665625,
|
|
"epoch": 1.8477518108514417,
|
|
"grad_norm": 0.19154950529143203,
|
|
"learning_rate": 3.9277159363111175e-07,
|
|
"loss": 1.6473,
|
|
"mean_token_accuracy": 0.6576918184757232,
|
|
"num_tokens": 1249209035.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 1.6265625,
|
|
"epoch": 1.849118491184912,
|
|
"grad_norm": 0.1962968015704732,
|
|
"learning_rate": 3.892489784415951e-07,
|
|
"loss": 1.6383,
|
|
"mean_token_accuracy": 0.6553558051586151,
|
|
"num_tokens": 1250118691.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.8504851715183819,
|
|
"grad_norm": 0.23303111946008181,
|
|
"learning_rate": 3.8572636325207837e-07,
|
|
"loss": 1.6512,
|
|
"mean_token_accuracy": 0.6538998782634735,
|
|
"num_tokens": 1250971158.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 1.69453125,
|
|
"epoch": 1.8518518518518519,
|
|
"grad_norm": 0.1601010273853784,
|
|
"learning_rate": 3.822037480625617e-07,
|
|
"loss": 1.6962,
|
|
"mean_token_accuracy": 0.6491588354110718,
|
|
"num_tokens": 1251895668.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 1.55390625,
|
|
"epoch": 1.8532185321853218,
|
|
"grad_norm": 0.28668967502664594,
|
|
"learning_rate": 3.78681132873045e-07,
|
|
"loss": 1.5542,
|
|
"mean_token_accuracy": 0.6691259562969207,
|
|
"num_tokens": 1252889467.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 1.67578125,
|
|
"epoch": 1.8545852125187918,
|
|
"grad_norm": 0.23871344172103573,
|
|
"learning_rate": 3.7515851768352827e-07,
|
|
"loss": 1.6643,
|
|
"mean_token_accuracy": 0.6530919432640075,
|
|
"num_tokens": 1253803986.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 1.6515625,
|
|
"epoch": 1.8559518928522618,
|
|
"grad_norm": 0.25137007103206466,
|
|
"learning_rate": 3.716359024940116e-07,
|
|
"loss": 1.6738,
|
|
"mean_token_accuracy": 0.6494036972522735,
|
|
"num_tokens": 1254691251.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.857318573185732,
|
|
"grad_norm": 0.2356207038767937,
|
|
"learning_rate": 3.6811328730449484e-07,
|
|
"loss": 1.6601,
|
|
"mean_token_accuracy": 0.6522770643234252,
|
|
"num_tokens": 1255630605.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 1.6140625,
|
|
"epoch": 1.858685253519202,
|
|
"grad_norm": 0.1889060632170119,
|
|
"learning_rate": 3.645906721149782e-07,
|
|
"loss": 1.6137,
|
|
"mean_token_accuracy": 0.6599112391471863,
|
|
"num_tokens": 1256499197.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 1.65546875,
|
|
"epoch": 1.860051933852672,
|
|
"grad_norm": 0.3888811232681689,
|
|
"learning_rate": 3.6106805692546145e-07,
|
|
"loss": 1.6733,
|
|
"mean_token_accuracy": 0.6512362062931061,
|
|
"num_tokens": 1257465550.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 1.68359375,
|
|
"epoch": 1.861418614186142,
|
|
"grad_norm": 0.2115194894942806,
|
|
"learning_rate": 3.575454417359448e-07,
|
|
"loss": 1.6815,
|
|
"mean_token_accuracy": 0.6493850767612457,
|
|
"num_tokens": 1258365889.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.862785294519612,
|
|
"grad_norm": 0.2003385225454913,
|
|
"learning_rate": 3.5402282654642807e-07,
|
|
"loss": 1.6052,
|
|
"mean_token_accuracy": 0.6628158569335938,
|
|
"num_tokens": 1259268941.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.864151974853082,
|
|
"grad_norm": 0.21299045056479926,
|
|
"learning_rate": 3.505002113569114e-07,
|
|
"loss": 1.6528,
|
|
"mean_token_accuracy": 0.6534935235977173,
|
|
"num_tokens": 1260212326.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 1.6234375,
|
|
"epoch": 1.865518655186552,
|
|
"grad_norm": 0.31729511347558836,
|
|
"learning_rate": 3.469775961673947e-07,
|
|
"loss": 1.6558,
|
|
"mean_token_accuracy": 0.6558128476142884,
|
|
"num_tokens": 1261098964.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 1.5609375,
|
|
"epoch": 1.866885335520022,
|
|
"grad_norm": 0.19309018528474223,
|
|
"learning_rate": 3.43454980977878e-07,
|
|
"loss": 1.5738,
|
|
"mean_token_accuracy": 0.6714577376842499,
|
|
"num_tokens": 1262006498.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 1.66015625,
|
|
"epoch": 1.8682520158534919,
|
|
"grad_norm": 0.24428239154222225,
|
|
"learning_rate": 3.399323657883613e-07,
|
|
"loss": 1.6512,
|
|
"mean_token_accuracy": 0.6539737045764923,
|
|
"num_tokens": 1262919487.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 1.66953125,
|
|
"epoch": 1.8696186961869619,
|
|
"grad_norm": 0.19674406751911824,
|
|
"learning_rate": 3.364097505988446e-07,
|
|
"loss": 1.683,
|
|
"mean_token_accuracy": 0.6496703445911407,
|
|
"num_tokens": 1263877030.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 1.6734375,
|
|
"epoch": 1.8709853765204318,
|
|
"grad_norm": 0.19427277272273058,
|
|
"learning_rate": 3.328871354093279e-07,
|
|
"loss": 1.6897,
|
|
"mean_token_accuracy": 0.6475345492362976,
|
|
"num_tokens": 1264833802.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.8723520568539018,
|
|
"grad_norm": 0.18880463343435175,
|
|
"learning_rate": 3.293645202198112e-07,
|
|
"loss": 1.6274,
|
|
"mean_token_accuracy": 0.6571232736110687,
|
|
"num_tokens": 1265766821.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 1.63046875,
|
|
"epoch": 1.8737187371873718,
|
|
"grad_norm": 0.20405817245583208,
|
|
"learning_rate": 3.2584190503029454e-07,
|
|
"loss": 1.6385,
|
|
"mean_token_accuracy": 0.6561318695545196,
|
|
"num_tokens": 1266654129.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 1.58828125,
|
|
"epoch": 1.8750854175208418,
|
|
"grad_norm": 0.2027340091028624,
|
|
"learning_rate": 3.223192898407778e-07,
|
|
"loss": 1.6071,
|
|
"mean_token_accuracy": 0.6635313987731933,
|
|
"num_tokens": 1267587811.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 1.6171875,
|
|
"epoch": 1.8764520978543118,
|
|
"grad_norm": 0.2996054368218284,
|
|
"learning_rate": 3.1879667465126116e-07,
|
|
"loss": 1.6328,
|
|
"mean_token_accuracy": 0.6626071214675904,
|
|
"num_tokens": 1268527458.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.8778187781877818,
|
|
"grad_norm": 0.15850291315667556,
|
|
"learning_rate": 3.152740594617444e-07,
|
|
"loss": 1.6438,
|
|
"mean_token_accuracy": 0.65880588889122,
|
|
"num_tokens": 1269507989.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 1.646875,
|
|
"epoch": 1.8791854585212517,
|
|
"grad_norm": 0.2161715899105692,
|
|
"learning_rate": 3.117514442722277e-07,
|
|
"loss": 1.6447,
|
|
"mean_token_accuracy": 0.6589183449745178,
|
|
"num_tokens": 1270461385.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 1.53203125,
|
|
"epoch": 1.8805521388547217,
|
|
"grad_norm": 0.2833977308708793,
|
|
"learning_rate": 3.08228829082711e-07,
|
|
"loss": 1.5417,
|
|
"mean_token_accuracy": 0.671460646390915,
|
|
"num_tokens": 1271430904.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 1.640625,
|
|
"epoch": 1.881918819188192,
|
|
"grad_norm": 0.22055343211312964,
|
|
"learning_rate": 3.0470621389319434e-07,
|
|
"loss": 1.6429,
|
|
"mean_token_accuracy": 0.6574861943721771,
|
|
"num_tokens": 1272333058.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.883285499521662,
|
|
"grad_norm": 0.17992722470430633,
|
|
"learning_rate": 3.0118359870367763e-07,
|
|
"loss": 1.5923,
|
|
"mean_token_accuracy": 0.6660229146480561,
|
|
"num_tokens": 1273213989.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 1.60078125,
|
|
"epoch": 1.884652179855132,
|
|
"grad_norm": 0.24619157226215813,
|
|
"learning_rate": 2.9766098351416096e-07,
|
|
"loss": 1.5881,
|
|
"mean_token_accuracy": 0.6664037883281708,
|
|
"num_tokens": 1274148336.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 1.63984375,
|
|
"epoch": 1.8860188601886019,
|
|
"grad_norm": 0.16320448496596776,
|
|
"learning_rate": 2.9413836832464424e-07,
|
|
"loss": 1.6602,
|
|
"mean_token_accuracy": 0.6534466326236725,
|
|
"num_tokens": 1275110754.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 1.8873855405220719,
|
|
"grad_norm": 0.21059785841379455,
|
|
"learning_rate": 2.906157531351276e-07,
|
|
"loss": 1.6706,
|
|
"mean_token_accuracy": 0.6537244021892548,
|
|
"num_tokens": 1276044747.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 1.63515625,
|
|
"epoch": 1.8887522208555418,
|
|
"grad_norm": 0.28352017340109337,
|
|
"learning_rate": 2.8709313794561086e-07,
|
|
"loss": 1.6346,
|
|
"mean_token_accuracy": 0.6599922955036164,
|
|
"num_tokens": 1276996534.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 1.6375,
|
|
"epoch": 1.890118901189012,
|
|
"grad_norm": 0.18928781716988552,
|
|
"learning_rate": 2.8357052275609415e-07,
|
|
"loss": 1.6368,
|
|
"mean_token_accuracy": 0.6567727982997894,
|
|
"num_tokens": 1277875092.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 1.891485581522482,
|
|
"grad_norm": 0.20469494393569582,
|
|
"learning_rate": 2.8004790756657743e-07,
|
|
"loss": 1.731,
|
|
"mean_token_accuracy": 0.6467212021350861,
|
|
"num_tokens": 1278801125.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 1.634375,
|
|
"epoch": 1.892852261855952,
|
|
"grad_norm": 0.22924628143376438,
|
|
"learning_rate": 2.7652529237706076e-07,
|
|
"loss": 1.6246,
|
|
"mean_token_accuracy": 0.65994313955307,
|
|
"num_tokens": 1279726873.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 1.55859375,
|
|
"epoch": 1.894218942189422,
|
|
"grad_norm": 0.17477531246486777,
|
|
"learning_rate": 2.7300267718754405e-07,
|
|
"loss": 1.5659,
|
|
"mean_token_accuracy": 0.6690386474132538,
|
|
"num_tokens": 1280621870.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 1.52109375,
|
|
"epoch": 1.895585622522892,
|
|
"grad_norm": 0.16325380998292782,
|
|
"learning_rate": 2.694800619980274e-07,
|
|
"loss": 1.539,
|
|
"mean_token_accuracy": 0.6752135872840881,
|
|
"num_tokens": 1281550983.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 1.896952302856362,
|
|
"grad_norm": 0.21473631862066103,
|
|
"learning_rate": 2.6595744680851066e-07,
|
|
"loss": 1.6065,
|
|
"mean_token_accuracy": 0.6608222007751465,
|
|
"num_tokens": 1282508007.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.898318983189832,
|
|
"grad_norm": 0.17947770155508885,
|
|
"learning_rate": 2.6243483161899395e-07,
|
|
"loss": 1.6754,
|
|
"mean_token_accuracy": 0.6520252108573914,
|
|
"num_tokens": 1283413689.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 1.66640625,
|
|
"epoch": 1.899685663523302,
|
|
"grad_norm": 0.1907198019929457,
|
|
"learning_rate": 2.589122164294773e-07,
|
|
"loss": 1.6638,
|
|
"mean_token_accuracy": 0.6535060048103333,
|
|
"num_tokens": 1284385503.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 1.64765625,
|
|
"epoch": 1.901052343856772,
|
|
"grad_norm": 0.21053404520319421,
|
|
"learning_rate": 2.5538960123996056e-07,
|
|
"loss": 1.6741,
|
|
"mean_token_accuracy": 0.6536179900169372,
|
|
"num_tokens": 1285302973.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 1.60859375,
|
|
"epoch": 1.902419024190242,
|
|
"grad_norm": 0.2641089121799922,
|
|
"learning_rate": 2.518669860504439e-07,
|
|
"loss": 1.6199,
|
|
"mean_token_accuracy": 0.656044852733612,
|
|
"num_tokens": 1286175510.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.9037857045237119,
|
|
"grad_norm": 0.18738460537858767,
|
|
"learning_rate": 2.483443708609272e-07,
|
|
"loss": 1.6486,
|
|
"mean_token_accuracy": 0.652789956331253,
|
|
"num_tokens": 1287080634.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 1.5671875,
|
|
"epoch": 1.9051523848571819,
|
|
"grad_norm": 0.20538831830529988,
|
|
"learning_rate": 2.4482175567141046e-07,
|
|
"loss": 1.5757,
|
|
"mean_token_accuracy": 0.6656188249588013,
|
|
"num_tokens": 1288020989.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 1.6890625,
|
|
"epoch": 1.9065190651906518,
|
|
"grad_norm": 0.18613286099981097,
|
|
"learning_rate": 2.4129914048189375e-07,
|
|
"loss": 1.6962,
|
|
"mean_token_accuracy": 0.647197014093399,
|
|
"num_tokens": 1288988808.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 1.64609375,
|
|
"epoch": 1.9078857455241218,
|
|
"grad_norm": 0.288062999193962,
|
|
"learning_rate": 2.3777652529237708e-07,
|
|
"loss": 1.6756,
|
|
"mean_token_accuracy": 0.6537116706371308,
|
|
"num_tokens": 1289927084.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 1.6609375,
|
|
"epoch": 1.9092524258575918,
|
|
"grad_norm": 0.23567081538402804,
|
|
"learning_rate": 2.342539101028604e-07,
|
|
"loss": 1.6873,
|
|
"mean_token_accuracy": 0.6516539692878723,
|
|
"num_tokens": 1290895128.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 1.6109375,
|
|
"epoch": 1.9106191061910618,
|
|
"grad_norm": 0.283546834511967,
|
|
"learning_rate": 2.307312949133437e-07,
|
|
"loss": 1.6177,
|
|
"mean_token_accuracy": 0.6591763973236084,
|
|
"num_tokens": 1291803625.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 1.5765625,
|
|
"epoch": 1.9119857865245318,
|
|
"grad_norm": 0.27808546026911934,
|
|
"learning_rate": 2.2720867972382698e-07,
|
|
"loss": 1.5914,
|
|
"mean_token_accuracy": 0.6645192563533783,
|
|
"num_tokens": 1292722957.0,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"entropy": 1.64375,
|
|
"epoch": 1.9133524668580018,
|
|
"grad_norm": 0.21065962906623298,
|
|
"learning_rate": 2.236860645343103e-07,
|
|
"loss": 1.6245,
|
|
"mean_token_accuracy": 0.6589625597000122,
|
|
"num_tokens": 1293621754.0,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.914719147191472,
|
|
"grad_norm": 0.29357061345769286,
|
|
"learning_rate": 2.201634493447936e-07,
|
|
"loss": 1.6085,
|
|
"mean_token_accuracy": 0.6619774043560028,
|
|
"num_tokens": 1294515703.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 1.5921875,
|
|
"epoch": 1.916085827524942,
|
|
"grad_norm": 0.1831479333305406,
|
|
"learning_rate": 2.166408341552769e-07,
|
|
"loss": 1.6091,
|
|
"mean_token_accuracy": 0.6610323429107666,
|
|
"num_tokens": 1295417865.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 1.62734375,
|
|
"epoch": 1.917452507858412,
|
|
"grad_norm": 0.1759738254582884,
|
|
"learning_rate": 2.131182189657602e-07,
|
|
"loss": 1.6231,
|
|
"mean_token_accuracy": 0.6602392315864563,
|
|
"num_tokens": 1296331317.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 1.615625,
|
|
"epoch": 1.918819188191882,
|
|
"grad_norm": 0.23694785058707052,
|
|
"learning_rate": 2.095956037762435e-07,
|
|
"loss": 1.637,
|
|
"mean_token_accuracy": 0.655489444732666,
|
|
"num_tokens": 1297254111.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 1.63203125,
|
|
"epoch": 1.920185868525352,
|
|
"grad_norm": 0.2694187016036195,
|
|
"learning_rate": 2.0607298858672678e-07,
|
|
"loss": 1.6465,
|
|
"mean_token_accuracy": 0.659600031375885,
|
|
"num_tokens": 1298187007.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.9215525488588219,
|
|
"grad_norm": 0.35238442256535907,
|
|
"learning_rate": 2.025503733972101e-07,
|
|
"loss": 1.6027,
|
|
"mean_token_accuracy": 0.6645530998706818,
|
|
"num_tokens": 1299121378.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 1.70390625,
|
|
"epoch": 1.922919229192292,
|
|
"grad_norm": 0.21302076953063093,
|
|
"learning_rate": 1.990277582076934e-07,
|
|
"loss": 1.7155,
|
|
"mean_token_accuracy": 0.6459619283676148,
|
|
"num_tokens": 1300078986.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.924285909525762,
|
|
"grad_norm": 0.2009804687666972,
|
|
"learning_rate": 1.955051430181767e-07,
|
|
"loss": 1.5716,
|
|
"mean_token_accuracy": 0.6642172157764434,
|
|
"num_tokens": 1300947341.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 1.56171875,
|
|
"epoch": 1.925652589859232,
|
|
"grad_norm": 0.18877356715540516,
|
|
"learning_rate": 1.9198252782866002e-07,
|
|
"loss": 1.5608,
|
|
"mean_token_accuracy": 0.6709876835346222,
|
|
"num_tokens": 1301870151.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 1.59375,
|
|
"epoch": 1.927019270192702,
|
|
"grad_norm": 0.1904116922943958,
|
|
"learning_rate": 1.8845991263914333e-07,
|
|
"loss": 1.6083,
|
|
"mean_token_accuracy": 0.6619009554386139,
|
|
"num_tokens": 1302800764.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 1.6421875,
|
|
"epoch": 1.928385950526172,
|
|
"grad_norm": 0.19717173001260577,
|
|
"learning_rate": 1.8493729744962664e-07,
|
|
"loss": 1.6332,
|
|
"mean_token_accuracy": 0.6577211201190949,
|
|
"num_tokens": 1303648983.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.929752630859642,
|
|
"grad_norm": 0.6813684775022584,
|
|
"learning_rate": 1.8141468226010995e-07,
|
|
"loss": 1.6562,
|
|
"mean_token_accuracy": 0.6551967561244965,
|
|
"num_tokens": 1304591100.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 1.58359375,
|
|
"epoch": 1.931119311193112,
|
|
"grad_norm": 0.3300432086880774,
|
|
"learning_rate": 1.778920670705932e-07,
|
|
"loss": 1.5771,
|
|
"mean_token_accuracy": 0.6677958369255066,
|
|
"num_tokens": 1305519778.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 1.61640625,
|
|
"epoch": 1.932485991526582,
|
|
"grad_norm": 0.18360672836835876,
|
|
"learning_rate": 1.743694518810765e-07,
|
|
"loss": 1.627,
|
|
"mean_token_accuracy": 0.6581778407096863,
|
|
"num_tokens": 1306464230.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 1.63828125,
|
|
"epoch": 1.933852671860052,
|
|
"grad_norm": 0.24740860901558143,
|
|
"learning_rate": 1.7084683669155982e-07,
|
|
"loss": 1.6202,
|
|
"mean_token_accuracy": 0.6609730303287507,
|
|
"num_tokens": 1307400065.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 1.6484375,
|
|
"epoch": 1.935219352193522,
|
|
"grad_norm": 0.33893803066800204,
|
|
"learning_rate": 1.6732422150204313e-07,
|
|
"loss": 1.6462,
|
|
"mean_token_accuracy": 0.6561708390712738,
|
|
"num_tokens": 1308298919.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.936586032526992,
|
|
"grad_norm": 0.24007246501405216,
|
|
"learning_rate": 1.6380160631252644e-07,
|
|
"loss": 1.6114,
|
|
"mean_token_accuracy": 0.6605499029159546,
|
|
"num_tokens": 1309268465.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 1.5609375,
|
|
"epoch": 1.937952712860462,
|
|
"grad_norm": 0.2663573399375865,
|
|
"learning_rate": 1.6027899112300975e-07,
|
|
"loss": 1.5574,
|
|
"mean_token_accuracy": 0.6697730779647827,
|
|
"num_tokens": 1310180742.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 1.59609375,
|
|
"epoch": 1.9393193931939319,
|
|
"grad_norm": 0.17339828634494783,
|
|
"learning_rate": 1.5675637593349303e-07,
|
|
"loss": 1.5922,
|
|
"mean_token_accuracy": 0.666955292224884,
|
|
"num_tokens": 1311138236.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 1.62578125,
|
|
"epoch": 1.9406860735274019,
|
|
"grad_norm": 0.18170085742461253,
|
|
"learning_rate": 1.5323376074397634e-07,
|
|
"loss": 1.6196,
|
|
"mean_token_accuracy": 0.6594277918338776,
|
|
"num_tokens": 1312085446.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 1.71484375,
|
|
"epoch": 1.9420527538608718,
|
|
"grad_norm": 0.2042128331788708,
|
|
"learning_rate": 1.4971114555445965e-07,
|
|
"loss": 1.7286,
|
|
"mean_token_accuracy": 0.6401301503181458,
|
|
"num_tokens": 1313017944.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 1.621875,
|
|
"epoch": 1.9434194341943418,
|
|
"grad_norm": 0.1891659084650714,
|
|
"learning_rate": 1.4618853036494293e-07,
|
|
"loss": 1.6565,
|
|
"mean_token_accuracy": 0.6542292833328247,
|
|
"num_tokens": 1313917794.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 1.6359375,
|
|
"epoch": 1.9447861145278118,
|
|
"grad_norm": 0.2461506185191629,
|
|
"learning_rate": 1.4266591517542624e-07,
|
|
"loss": 1.6482,
|
|
"mean_token_accuracy": 0.6567415475845337,
|
|
"num_tokens": 1314814233.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 1.9461527948612818,
|
|
"grad_norm": 0.2515692547770151,
|
|
"learning_rate": 1.3914329998590955e-07,
|
|
"loss": 1.5935,
|
|
"mean_token_accuracy": 0.6649889767169952,
|
|
"num_tokens": 1315678577.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 1.6,
|
|
"epoch": 1.947519475194752,
|
|
"grad_norm": 0.22130154818684689,
|
|
"learning_rate": 1.3562068479639286e-07,
|
|
"loss": 1.6085,
|
|
"mean_token_accuracy": 0.6620361328125,
|
|
"num_tokens": 1316642774.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.948886155528222,
|
|
"grad_norm": 0.2233355146956501,
|
|
"learning_rate": 1.3209806960687614e-07,
|
|
"loss": 1.5587,
|
|
"mean_token_accuracy": 0.6692114353179932,
|
|
"num_tokens": 1317529023.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 1.603125,
|
|
"epoch": 1.950252835861692,
|
|
"grad_norm": 0.2053800445461783,
|
|
"learning_rate": 1.2857545441735945e-07,
|
|
"loss": 1.6208,
|
|
"mean_token_accuracy": 0.6594499588012696,
|
|
"num_tokens": 1318438703.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 1.65078125,
|
|
"epoch": 1.951619516195162,
|
|
"grad_norm": 0.246106700799789,
|
|
"learning_rate": 1.2505283922784276e-07,
|
|
"loss": 1.649,
|
|
"mean_token_accuracy": 0.658135038614273,
|
|
"num_tokens": 1319396533.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.952986196528632,
|
|
"grad_norm": 0.19104205860073012,
|
|
"learning_rate": 1.2153022403832607e-07,
|
|
"loss": 1.5954,
|
|
"mean_token_accuracy": 0.6629545629024506,
|
|
"num_tokens": 1320331862.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 1.6328125,
|
|
"epoch": 1.954352876862102,
|
|
"grad_norm": 0.20417520910038137,
|
|
"learning_rate": 1.1800760884880937e-07,
|
|
"loss": 1.646,
|
|
"mean_token_accuracy": 0.6546025156974793,
|
|
"num_tokens": 1321296761.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 1.6046875,
|
|
"epoch": 1.9557195571955721,
|
|
"grad_norm": 0.1831997630395628,
|
|
"learning_rate": 1.1448499365929266e-07,
|
|
"loss": 1.6114,
|
|
"mean_token_accuracy": 0.6595677196979522,
|
|
"num_tokens": 1322226910.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 1.65234375,
|
|
"epoch": 1.957086237529042,
|
|
"grad_norm": 0.23930541817672732,
|
|
"learning_rate": 1.1096237846977597e-07,
|
|
"loss": 1.6554,
|
|
"mean_token_accuracy": 0.6556226134300231,
|
|
"num_tokens": 1323142629.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 1.65390625,
|
|
"epoch": 1.958452917862512,
|
|
"grad_norm": 0.32705180874507495,
|
|
"learning_rate": 1.0743976328025928e-07,
|
|
"loss": 1.6495,
|
|
"mean_token_accuracy": 0.653829550743103,
|
|
"num_tokens": 1324071925.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.959819598195982,
|
|
"grad_norm": 0.20616917346537544,
|
|
"learning_rate": 1.0391714809074258e-07,
|
|
"loss": 1.5958,
|
|
"mean_token_accuracy": 0.6653057754039764,
|
|
"num_tokens": 1325008766.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 1.596875,
|
|
"epoch": 1.961186278529452,
|
|
"grad_norm": 0.24381529105348995,
|
|
"learning_rate": 1.0039453290122588e-07,
|
|
"loss": 1.6176,
|
|
"mean_token_accuracy": 0.6587579429149628,
|
|
"num_tokens": 1325919896.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 1.5359375,
|
|
"epoch": 1.962552958862922,
|
|
"grad_norm": 0.28420815000573224,
|
|
"learning_rate": 9.687191771170918e-08,
|
|
"loss": 1.5402,
|
|
"mean_token_accuracy": 0.6715553343296051,
|
|
"num_tokens": 1326822733.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 1.6546875,
|
|
"epoch": 1.963919639196392,
|
|
"grad_norm": 0.20657189087422986,
|
|
"learning_rate": 9.334930252219248e-08,
|
|
"loss": 1.6725,
|
|
"mean_token_accuracy": 0.6524428963661194,
|
|
"num_tokens": 1327784632.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 1.965286319529862,
|
|
"grad_norm": 0.18944414462895545,
|
|
"learning_rate": 8.982668733267578e-08,
|
|
"loss": 1.6167,
|
|
"mean_token_accuracy": 0.6606287062168121,
|
|
"num_tokens": 1328729313.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 1.56484375,
|
|
"epoch": 1.966652999863332,
|
|
"grad_norm": 0.29654270592701265,
|
|
"learning_rate": 8.630407214315909e-08,
|
|
"loss": 1.5728,
|
|
"mean_token_accuracy": 0.6692606091499329,
|
|
"num_tokens": 1329645532.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 1.7046875,
|
|
"epoch": 1.968019680196802,
|
|
"grad_norm": 0.27201150626298903,
|
|
"learning_rate": 8.27814569536424e-08,
|
|
"loss": 1.71,
|
|
"mean_token_accuracy": 0.6491980493068695,
|
|
"num_tokens": 1330581030.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 1.64140625,
|
|
"epoch": 1.969386360530272,
|
|
"grad_norm": 0.274940647700783,
|
|
"learning_rate": 7.925884176412568e-08,
|
|
"loss": 1.6679,
|
|
"mean_token_accuracy": 0.6516667544841767,
|
|
"num_tokens": 1331485060.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 1.68203125,
|
|
"epoch": 1.970753040863742,
|
|
"grad_norm": 0.18547383768566283,
|
|
"learning_rate": 7.573622657460899e-08,
|
|
"loss": 1.6953,
|
|
"mean_token_accuracy": 0.6488840341567993,
|
|
"num_tokens": 1332406372.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 1.55546875,
|
|
"epoch": 1.972119721197212,
|
|
"grad_norm": 0.2222510963006708,
|
|
"learning_rate": 7.22136113850923e-08,
|
|
"loss": 1.551,
|
|
"mean_token_accuracy": 0.6723632216453552,
|
|
"num_tokens": 1333339405.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 1.5703125,
|
|
"epoch": 1.9734864015306819,
|
|
"grad_norm": 0.18142694773412663,
|
|
"learning_rate": 6.869099619557561e-08,
|
|
"loss": 1.5783,
|
|
"mean_token_accuracy": 0.6658006429672241,
|
|
"num_tokens": 1334260544.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 1.5859375,
|
|
"epoch": 1.9748530818641519,
|
|
"grad_norm": 0.2624154007061923,
|
|
"learning_rate": 6.51683810060589e-08,
|
|
"loss": 1.6046,
|
|
"mean_token_accuracy": 0.6611201465129852,
|
|
"num_tokens": 1335220340.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 1.66875,
|
|
"epoch": 1.9762197621976219,
|
|
"grad_norm": 0.18673574679669735,
|
|
"learning_rate": 6.164576581654221e-08,
|
|
"loss": 1.6774,
|
|
"mean_token_accuracy": 0.6504150569438935,
|
|
"num_tokens": 1336106039.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 1.60234375,
|
|
"epoch": 1.9775864425310918,
|
|
"grad_norm": 0.27281093041440907,
|
|
"learning_rate": 5.8123150627025515e-08,
|
|
"loss": 1.6115,
|
|
"mean_token_accuracy": 0.6602864801883698,
|
|
"num_tokens": 1337004565.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 1.58046875,
|
|
"epoch": 1.9789531228645618,
|
|
"grad_norm": 0.19384942786890652,
|
|
"learning_rate": 5.460053543750881e-08,
|
|
"loss": 1.5941,
|
|
"mean_token_accuracy": 0.6649617552757263,
|
|
"num_tokens": 1337868049.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 1.66484375,
|
|
"epoch": 1.980319803198032,
|
|
"grad_norm": 0.16593860361325952,
|
|
"learning_rate": 5.107792024799211e-08,
|
|
"loss": 1.6845,
|
|
"mean_token_accuracy": 0.6499240815639495,
|
|
"num_tokens": 1338787654.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 1.6859375,
|
|
"epoch": 1.981686483531502,
|
|
"grad_norm": 0.21803654095704345,
|
|
"learning_rate": 4.7555305058475415e-08,
|
|
"loss": 1.6961,
|
|
"mean_token_accuracy": 0.6473641037940979,
|
|
"num_tokens": 1339672752.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 1.6125,
|
|
"epoch": 1.983053163864972,
|
|
"grad_norm": 0.24951193815167136,
|
|
"learning_rate": 4.403268986895872e-08,
|
|
"loss": 1.6402,
|
|
"mean_token_accuracy": 0.6575801432132721,
|
|
"num_tokens": 1340639636.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 1.659375,
|
|
"epoch": 1.984419844198442,
|
|
"grad_norm": 0.21379001570601336,
|
|
"learning_rate": 4.0510074679442026e-08,
|
|
"loss": 1.6781,
|
|
"mean_token_accuracy": 0.6507704377174377,
|
|
"num_tokens": 1341565600.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 1.6078125,
|
|
"epoch": 1.985786524531912,
|
|
"grad_norm": 0.18747707592812277,
|
|
"learning_rate": 3.698745948992532e-08,
|
|
"loss": 1.604,
|
|
"mean_token_accuracy": 0.6605421245098114,
|
|
"num_tokens": 1342423440.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 1.5578125,
|
|
"epoch": 1.987153204865382,
|
|
"grad_norm": 0.2610724189341316,
|
|
"learning_rate": 3.3464844300408624e-08,
|
|
"loss": 1.5608,
|
|
"mean_token_accuracy": 0.6659642636775971,
|
|
"num_tokens": 1343301104.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 1.57109375,
|
|
"epoch": 1.9885198851988521,
|
|
"grad_norm": 0.1774807705581725,
|
|
"learning_rate": 2.994222911089193e-08,
|
|
"loss": 1.5674,
|
|
"mean_token_accuracy": 0.6700686573982239,
|
|
"num_tokens": 1344252664.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 1.62890625,
|
|
"epoch": 1.9898865655323221,
|
|
"grad_norm": 0.3343532701330076,
|
|
"learning_rate": 2.6419613921375232e-08,
|
|
"loss": 1.6452,
|
|
"mean_token_accuracy": 0.6564302563667297,
|
|
"num_tokens": 1345208173.0,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"entropy": 1.59296875,
|
|
"epoch": 1.991253245865792,
|
|
"grad_norm": 0.40495512153110874,
|
|
"learning_rate": 2.289699873185853e-08,
|
|
"loss": 1.5965,
|
|
"mean_token_accuracy": 0.6600158274173736,
|
|
"num_tokens": 1346125524.0,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"entropy": 1.67734375,
|
|
"epoch": 1.992619926199262,
|
|
"grad_norm": 0.15487343600401676,
|
|
"learning_rate": 1.9374383542341837e-08,
|
|
"loss": 1.6884,
|
|
"mean_token_accuracy": 0.6477705538272858,
|
|
"num_tokens": 1347011012.0,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"entropy": 1.55234375,
|
|
"epoch": 1.993986606532732,
|
|
"grad_norm": 0.24036251932630182,
|
|
"learning_rate": 1.585176835282514e-08,
|
|
"loss": 1.5391,
|
|
"mean_token_accuracy": 0.6736408412456513,
|
|
"num_tokens": 1347934129.0,
|
|
"step": 14590
|
|
},
|
|
{
|
|
"entropy": 1.71640625,
|
|
"epoch": 1.995353286866202,
|
|
"grad_norm": 0.19575585642943658,
|
|
"learning_rate": 1.2329153163308442e-08,
|
|
"loss": 1.7273,
|
|
"mean_token_accuracy": 0.6444430947303772,
|
|
"num_tokens": 1348871663.0,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"entropy": 1.60546875,
|
|
"epoch": 1.996719967199672,
|
|
"grad_norm": 0.17694854060816156,
|
|
"learning_rate": 8.806537973791744e-09,
|
|
"loss": 1.6083,
|
|
"mean_token_accuracy": 0.6613716542720794,
|
|
"num_tokens": 1349832359.0,
|
|
"step": 14610
|
|
},
|
|
{
|
|
"entropy": 1.6203125,
|
|
"epoch": 1.998086647533142,
|
|
"grad_norm": 0.18683027120141105,
|
|
"learning_rate": 5.2839227842750465e-09,
|
|
"loss": 1.6367,
|
|
"mean_token_accuracy": 0.6563838243484497,
|
|
"num_tokens": 1350784913.0,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"entropy": 1.62109375,
|
|
"epoch": 1.999453327866612,
|
|
"grad_norm": 0.20547140511314427,
|
|
"learning_rate": 1.7613075947583486e-09,
|
|
"loss": 1.6102,
|
|
"mean_token_accuracy": 0.6610078155994416,
|
|
"num_tokens": 1351710592.0,
|
|
"step": 14630
|
|
},
{
"entropy": 1.6328125,
"epoch": 2.0,
"mean_token_accuracy": 0.6548221111297607,
"num_tokens": 1352093289.0,
"step": 14634,
"total_flos": 2710644286685184.0,
"train_loss": 1.7033782835881275,
"train_runtime": 26263.5838,
"train_samples_per_second": 71.316,
"train_steps_per_second": 0.557
}
],
"logging_steps": 10,
"max_steps": 14634,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2710644286685184.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}