7664 lines
216 KiB
JSON
7664 lines
216 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 762,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 0.57855224609375,
|
||
|
|
"epoch": 0.003937007874015748,
|
||
|
|
"grad_norm": 6.017421193838227,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 1.3958,
|
||
|
|
"mean_token_accuracy": 0.6527928654104471,
|
||
|
|
"num_tokens": 405091.0,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.57177734375,
|
||
|
|
"epoch": 0.007874015748031496,
|
||
|
|
"grad_norm": 5.993264916340701,
|
||
|
|
"learning_rate": 5.128205128205128e-07,
|
||
|
|
"loss": 1.4236,
|
||
|
|
"mean_token_accuracy": 0.6467109536752105,
|
||
|
|
"num_tokens": 829295.0,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.586273193359375,
|
||
|
|
"epoch": 0.011811023622047244,
|
||
|
|
"grad_norm": 5.967015920534343,
|
||
|
|
"learning_rate": 1.0256410256410257e-06,
|
||
|
|
"loss": 1.4081,
|
||
|
|
"mean_token_accuracy": 0.6507676001638174,
|
||
|
|
"num_tokens": 1234884.0,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.575836181640625,
|
||
|
|
"epoch": 0.015748031496062992,
|
||
|
|
"grad_norm": 5.845522648758348,
|
||
|
|
"learning_rate": 1.5384615384615387e-06,
|
||
|
|
"loss": 1.3926,
|
||
|
|
"mean_token_accuracy": 0.6580017423257232,
|
||
|
|
"num_tokens": 1629773.0,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.56964111328125,
|
||
|
|
"epoch": 0.01968503937007874,
|
||
|
|
"grad_norm": 5.790573559158371,
|
||
|
|
"learning_rate": 2.0512820512820513e-06,
|
||
|
|
"loss": 1.3868,
|
||
|
|
"mean_token_accuracy": 0.657335345633328,
|
||
|
|
"num_tokens": 2041925.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.56451416015625,
|
||
|
|
"epoch": 0.023622047244094488,
|
||
|
|
"grad_norm": 5.568170796627478,
|
||
|
|
"learning_rate": 2.564102564102564e-06,
|
||
|
|
"loss": 1.3857,
|
||
|
|
"mean_token_accuracy": 0.6550841787829995,
|
||
|
|
"num_tokens": 2489592.0,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.568939208984375,
|
||
|
|
"epoch": 0.027559055118110236,
|
||
|
|
"grad_norm": 5.1196996584808865,
|
||
|
|
"learning_rate": 3.0769230769230774e-06,
|
||
|
|
"loss": 1.3463,
|
||
|
|
"mean_token_accuracy": 0.6607445329427719,
|
||
|
|
"num_tokens": 2911373.0,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.573883056640625,
|
||
|
|
"epoch": 0.031496062992125984,
|
||
|
|
"grad_norm": 4.493954294279661,
|
||
|
|
"learning_rate": 3.58974358974359e-06,
|
||
|
|
"loss": 1.3163,
|
||
|
|
"mean_token_accuracy": 0.6666090982034802,
|
||
|
|
"num_tokens": 3345237.0,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.582183837890625,
|
||
|
|
"epoch": 0.03543307086614173,
|
||
|
|
"grad_norm": 4.315469270405331,
|
||
|
|
"learning_rate": 4.102564102564103e-06,
|
||
|
|
"loss": 1.2769,
|
||
|
|
"mean_token_accuracy": 0.6745583917945623,
|
||
|
|
"num_tokens": 3754911.0,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.564117431640625,
|
||
|
|
"epoch": 0.03937007874015748,
|
||
|
|
"grad_norm": 3.6044990712171083,
|
||
|
|
"learning_rate": 4.615384615384616e-06,
|
||
|
|
"loss": 1.1654,
|
||
|
|
"mean_token_accuracy": 0.6969763962551951,
|
||
|
|
"num_tokens": 4180069.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5650634765625,
|
||
|
|
"epoch": 0.04330708661417323,
|
||
|
|
"grad_norm": 3.34006760349023,
|
||
|
|
"learning_rate": 5.128205128205128e-06,
|
||
|
|
"loss": 1.1559,
|
||
|
|
"mean_token_accuracy": 0.6937189754098654,
|
||
|
|
"num_tokens": 4610750.0,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.55633544921875,
|
||
|
|
"epoch": 0.047244094488188976,
|
||
|
|
"grad_norm": 3.2756727794133242,
|
||
|
|
"learning_rate": 5.641025641025641e-06,
|
||
|
|
"loss": 1.1369,
|
||
|
|
"mean_token_accuracy": 0.6968221105635166,
|
||
|
|
"num_tokens": 5055896.0,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53607177734375,
|
||
|
|
"epoch": 0.051181102362204724,
|
||
|
|
"grad_norm": 4.3300668215494555,
|
||
|
|
"learning_rate": 6.153846153846155e-06,
|
||
|
|
"loss": 1.0162,
|
||
|
|
"mean_token_accuracy": 0.720275528728962,
|
||
|
|
"num_tokens": 5468686.0,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.54345703125,
|
||
|
|
"epoch": 0.05511811023622047,
|
||
|
|
"grad_norm": 4.5925042609040645,
|
||
|
|
"learning_rate": 6.666666666666667e-06,
|
||
|
|
"loss": 1.0286,
|
||
|
|
"mean_token_accuracy": 0.7187379905954003,
|
||
|
|
"num_tokens": 5886307.0,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.539398193359375,
|
||
|
|
"epoch": 0.05905511811023622,
|
||
|
|
"grad_norm": 3.8670134026701626,
|
||
|
|
"learning_rate": 7.17948717948718e-06,
|
||
|
|
"loss": 0.9893,
|
||
|
|
"mean_token_accuracy": 0.7269825097173452,
|
||
|
|
"num_tokens": 6325637.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.538116455078125,
|
||
|
|
"epoch": 0.06299212598425197,
|
||
|
|
"grad_norm": 3.3424168128794434,
|
||
|
|
"learning_rate": 7.692307692307694e-06,
|
||
|
|
"loss": 0.9672,
|
||
|
|
"mean_token_accuracy": 0.7305122185498476,
|
||
|
|
"num_tokens": 6766914.0,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.55633544921875,
|
||
|
|
"epoch": 0.06692913385826772,
|
||
|
|
"grad_norm": 3.7699936932143077,
|
||
|
|
"learning_rate": 8.205128205128205e-06,
|
||
|
|
"loss": 0.9261,
|
||
|
|
"mean_token_accuracy": 0.7375357151031494,
|
||
|
|
"num_tokens": 7185216.0,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5638427734375,
|
||
|
|
"epoch": 0.07086614173228346,
|
||
|
|
"grad_norm": 4.148747463938917,
|
||
|
|
"learning_rate": 8.717948717948719e-06,
|
||
|
|
"loss": 0.9387,
|
||
|
|
"mean_token_accuracy": 0.7338155778124928,
|
||
|
|
"num_tokens": 7596226.0,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5389404296875,
|
||
|
|
"epoch": 0.07480314960629922,
|
||
|
|
"grad_norm": 4.006513472613672,
|
||
|
|
"learning_rate": 9.230769230769232e-06,
|
||
|
|
"loss": 0.907,
|
||
|
|
"mean_token_accuracy": 0.7441706955432892,
|
||
|
|
"num_tokens": 8031675.0,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.550994873046875,
|
||
|
|
"epoch": 0.07874015748031496,
|
||
|
|
"grad_norm": 3.0586688382376694,
|
||
|
|
"learning_rate": 9.743589743589744e-06,
|
||
|
|
"loss": 0.8915,
|
||
|
|
"mean_token_accuracy": 0.7450471529737115,
|
||
|
|
"num_tokens": 8463875.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5362548828125,
|
||
|
|
"epoch": 0.08267716535433071,
|
||
|
|
"grad_norm": 3.010274397022863,
|
||
|
|
"learning_rate": 1.0256410256410256e-05,
|
||
|
|
"loss": 0.8649,
|
||
|
|
"mean_token_accuracy": 0.749404520727694,
|
||
|
|
"num_tokens": 8891456.0,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.534088134765625,
|
||
|
|
"epoch": 0.08661417322834646,
|
||
|
|
"grad_norm": 2.5938467957096667,
|
||
|
|
"learning_rate": 1.076923076923077e-05,
|
||
|
|
"loss": 0.8574,
|
||
|
|
"mean_token_accuracy": 0.7520366236567497,
|
||
|
|
"num_tokens": 9346156.0,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.542938232421875,
|
||
|
|
"epoch": 0.09055118110236221,
|
||
|
|
"grad_norm": 2.2126973604191456,
|
||
|
|
"learning_rate": 1.1282051282051283e-05,
|
||
|
|
"loss": 0.8397,
|
||
|
|
"mean_token_accuracy": 0.7569433562457561,
|
||
|
|
"num_tokens": 9774883.0,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.53131103515625,
|
||
|
|
"epoch": 0.09448818897637795,
|
||
|
|
"grad_norm": 2.6706476920762214,
|
||
|
|
"learning_rate": 1.1794871794871796e-05,
|
||
|
|
"loss": 0.8108,
|
||
|
|
"mean_token_accuracy": 0.7596273683011532,
|
||
|
|
"num_tokens": 10201146.0,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.537384033203125,
|
||
|
|
"epoch": 0.0984251968503937,
|
||
|
|
"grad_norm": 2.2269608876043154,
|
||
|
|
"learning_rate": 1.230769230769231e-05,
|
||
|
|
"loss": 0.7871,
|
||
|
|
"mean_token_accuracy": 0.7649025870487094,
|
||
|
|
"num_tokens": 10610203.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.531219482421875,
|
||
|
|
"epoch": 0.10236220472440945,
|
||
|
|
"grad_norm": 2.170252611622614,
|
||
|
|
"learning_rate": 1.2820512820512823e-05,
|
||
|
|
"loss": 0.7906,
|
||
|
|
"mean_token_accuracy": 0.7660603849217296,
|
||
|
|
"num_tokens": 11068101.0,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52508544921875,
|
||
|
|
"epoch": 0.1062992125984252,
|
||
|
|
"grad_norm": 2.242670995581003,
|
||
|
|
"learning_rate": 1.3333333333333333e-05,
|
||
|
|
"loss": 0.7748,
|
||
|
|
"mean_token_accuracy": 0.7676612958312035,
|
||
|
|
"num_tokens": 11493402.0,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.513397216796875,
|
||
|
|
"epoch": 0.11023622047244094,
|
||
|
|
"grad_norm": 1.7608050419499566,
|
||
|
|
"learning_rate": 1.3846153846153847e-05,
|
||
|
|
"loss": 0.7437,
|
||
|
|
"mean_token_accuracy": 0.7780561083927751,
|
||
|
|
"num_tokens": 11933204.0,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.510833740234375,
|
||
|
|
"epoch": 0.1141732283464567,
|
||
|
|
"grad_norm": 1.6033916769131809,
|
||
|
|
"learning_rate": 1.435897435897436e-05,
|
||
|
|
"loss": 0.7334,
|
||
|
|
"mean_token_accuracy": 0.7764604520052671,
|
||
|
|
"num_tokens": 12380898.0,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.52008056640625,
|
||
|
|
"epoch": 0.11811023622047244,
|
||
|
|
"grad_norm": 1.80791166592839,
|
||
|
|
"learning_rate": 1.4871794871794874e-05,
|
||
|
|
"loss": 0.7291,
|
||
|
|
"mean_token_accuracy": 0.7800389584153891,
|
||
|
|
"num_tokens": 12802897.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.518524169921875,
|
||
|
|
"epoch": 0.1220472440944882,
|
||
|
|
"grad_norm": 1.6689890484137595,
|
||
|
|
"learning_rate": 1.5384615384615387e-05,
|
||
|
|
"loss": 0.7372,
|
||
|
|
"mean_token_accuracy": 0.7804073309525847,
|
||
|
|
"num_tokens": 13239071.0,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.5186767578125,
|
||
|
|
"epoch": 0.12598425196850394,
|
||
|
|
"grad_norm": 1.71311066387976,
|
||
|
|
"learning_rate": 1.5897435897435897e-05,
|
||
|
|
"loss": 0.7011,
|
||
|
|
"mean_token_accuracy": 0.7851214902475476,
|
||
|
|
"num_tokens": 13671846.0,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.517120361328125,
|
||
|
|
"epoch": 0.12992125984251968,
|
||
|
|
"grad_norm": 1.653752860519691,
|
||
|
|
"learning_rate": 1.641025641025641e-05,
|
||
|
|
"loss": 0.6941,
|
||
|
|
"mean_token_accuracy": 0.7865419248118997,
|
||
|
|
"num_tokens": 14096702.0,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.51727294921875,
|
||
|
|
"epoch": 0.13385826771653545,
|
||
|
|
"grad_norm": 1.8994101238336887,
|
||
|
|
"learning_rate": 1.6923076923076924e-05,
|
||
|
|
"loss": 0.6762,
|
||
|
|
"mean_token_accuracy": 0.787972204387188,
|
||
|
|
"num_tokens": 14507141.0,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.50054931640625,
|
||
|
|
"epoch": 0.1377952755905512,
|
||
|
|
"grad_norm": 1.8005803715283153,
|
||
|
|
"learning_rate": 1.7435897435897438e-05,
|
||
|
|
"loss": 0.6678,
|
||
|
|
"mean_token_accuracy": 0.7918295972049236,
|
||
|
|
"num_tokens": 14924605.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.498931884765625,
|
||
|
|
"epoch": 0.14173228346456693,
|
||
|
|
"grad_norm": 1.7427723538583522,
|
||
|
|
"learning_rate": 1.794871794871795e-05,
|
||
|
|
"loss": 0.6568,
|
||
|
|
"mean_token_accuracy": 0.7931245760992169,
|
||
|
|
"num_tokens": 15340274.0,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.50238037109375,
|
||
|
|
"epoch": 0.14566929133858267,
|
||
|
|
"grad_norm": 1.9022526038852137,
|
||
|
|
"learning_rate": 1.8461538461538465e-05,
|
||
|
|
"loss": 0.6598,
|
||
|
|
"mean_token_accuracy": 0.7942408351227641,
|
||
|
|
"num_tokens": 15760102.0,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.505035400390625,
|
||
|
|
"epoch": 0.14960629921259844,
|
||
|
|
"grad_norm": 1.9173596829441977,
|
||
|
|
"learning_rate": 1.8974358974358975e-05,
|
||
|
|
"loss": 0.6603,
|
||
|
|
"mean_token_accuracy": 0.7934675076976418,
|
||
|
|
"num_tokens": 16168355.0,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.50341796875,
|
||
|
|
"epoch": 0.15354330708661418,
|
||
|
|
"grad_norm": 1.737835763355968,
|
||
|
|
"learning_rate": 1.9487179487179488e-05,
|
||
|
|
"loss": 0.6678,
|
||
|
|
"mean_token_accuracy": 0.7944545326754451,
|
||
|
|
"num_tokens": 16616270.0,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.504364013671875,
|
||
|
|
"epoch": 0.15748031496062992,
|
||
|
|
"grad_norm": 1.6371465461053467,
|
||
|
|
"learning_rate": 2e-05,
|
||
|
|
"loss": 0.6598,
|
||
|
|
"mean_token_accuracy": 0.7918455330654979,
|
||
|
|
"num_tokens": 17047860.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4927978515625,
|
||
|
|
"epoch": 0.16141732283464566,
|
||
|
|
"grad_norm": 1.6621682164769442,
|
||
|
|
"learning_rate": 1.999990559554882e-05,
|
||
|
|
"loss": 0.6623,
|
||
|
|
"mean_token_accuracy": 0.792108066380024,
|
||
|
|
"num_tokens": 17485177.0,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.496307373046875,
|
||
|
|
"epoch": 0.16535433070866143,
|
||
|
|
"grad_norm": 1.8543495232878142,
|
||
|
|
"learning_rate": 1.9999622383977725e-05,
|
||
|
|
"loss": 0.6467,
|
||
|
|
"mean_token_accuracy": 0.797334254719317,
|
||
|
|
"num_tokens": 17910948.0,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.490631103515625,
|
||
|
|
"epoch": 0.16929133858267717,
|
||
|
|
"grad_norm": 1.8574036554957438,
|
||
|
|
"learning_rate": 1.9999150370633987e-05,
|
||
|
|
"loss": 0.646,
|
||
|
|
"mean_token_accuracy": 0.7992923380807042,
|
||
|
|
"num_tokens": 18347007.0,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.503265380859375,
|
||
|
|
"epoch": 0.1732283464566929,
|
||
|
|
"grad_norm": 1.4203675083785523,
|
||
|
|
"learning_rate": 1.9998489564429656e-05,
|
||
|
|
"loss": 0.6524,
|
||
|
|
"mean_token_accuracy": 0.7955402638763189,
|
||
|
|
"num_tokens": 18770125.0,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4832763671875,
|
||
|
|
"epoch": 0.17716535433070865,
|
||
|
|
"grad_norm": 1.699095417852413,
|
||
|
|
"learning_rate": 1.999763997784133e-05,
|
||
|
|
"loss": 0.6559,
|
||
|
|
"mean_token_accuracy": 0.7955031348392367,
|
||
|
|
"num_tokens": 19226466.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.478118896484375,
|
||
|
|
"epoch": 0.18110236220472442,
|
||
|
|
"grad_norm": 1.5920566823332218,
|
||
|
|
"learning_rate": 1.9996601626909962e-05,
|
||
|
|
"loss": 0.6405,
|
||
|
|
"mean_token_accuracy": 0.7994664330035448,
|
||
|
|
"num_tokens": 19670619.0,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4974365234375,
|
||
|
|
"epoch": 0.18503937007874016,
|
||
|
|
"grad_norm": 1.332039937086849,
|
||
|
|
"learning_rate": 1.999537453124055e-05,
|
||
|
|
"loss": 0.6231,
|
||
|
|
"mean_token_accuracy": 0.8019787659868598,
|
||
|
|
"num_tokens": 20079141.0,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49603271484375,
|
||
|
|
"epoch": 0.1889763779527559,
|
||
|
|
"grad_norm": 1.5722338994254577,
|
||
|
|
"learning_rate": 1.9993958714001738e-05,
|
||
|
|
"loss": 0.6262,
|
||
|
|
"mean_token_accuracy": 0.8007524479180574,
|
||
|
|
"num_tokens": 20519574.0,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4832763671875,
|
||
|
|
"epoch": 0.19291338582677164,
|
||
|
|
"grad_norm": 1.4266430941019363,
|
||
|
|
"learning_rate": 1.9992354201925427e-05,
|
||
|
|
"loss": 0.6217,
|
||
|
|
"mean_token_accuracy": 0.8023361451923847,
|
||
|
|
"num_tokens": 20947519.0,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4827880859375,
|
||
|
|
"epoch": 0.1968503937007874,
|
||
|
|
"grad_norm": 1.7573722850889697,
|
||
|
|
"learning_rate": 1.9990561025306232e-05,
|
||
|
|
"loss": 0.6224,
|
||
|
|
"mean_token_accuracy": 0.8040017504245043,
|
||
|
|
"num_tokens": 21392560.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4984130859375,
|
||
|
|
"epoch": 0.20078740157480315,
|
||
|
|
"grad_norm": 1.3813190235222876,
|
||
|
|
"learning_rate": 1.998857921800092e-05,
|
||
|
|
"loss": 0.6275,
|
||
|
|
"mean_token_accuracy": 0.80109344702214,
|
||
|
|
"num_tokens": 21833277.0,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49365234375,
|
||
|
|
"epoch": 0.2047244094488189,
|
||
|
|
"grad_norm": 1.4982744826386218,
|
||
|
|
"learning_rate": 1.998640881742778e-05,
|
||
|
|
"loss": 0.6005,
|
||
|
|
"mean_token_accuracy": 0.8037025630474091,
|
||
|
|
"num_tokens": 22257152.0,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.478729248046875,
|
||
|
|
"epoch": 0.20866141732283464,
|
||
|
|
"grad_norm": 1.4345392282851195,
|
||
|
|
"learning_rate": 1.998404986456591e-05,
|
||
|
|
"loss": 0.5978,
|
||
|
|
"mean_token_accuracy": 0.8084583384916186,
|
||
|
|
"num_tokens": 22686084.0,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49456787109375,
|
||
|
|
"epoch": 0.2125984251968504,
|
||
|
|
"grad_norm": 1.3302730274926833,
|
||
|
|
"learning_rate": 1.9981502403954435e-05,
|
||
|
|
"loss": 0.5817,
|
||
|
|
"mean_token_accuracy": 0.8112034667283297,
|
||
|
|
"num_tokens": 23097849.0,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.49212646484375,
|
||
|
|
"epoch": 0.21653543307086615,
|
||
|
|
"grad_norm": 1.5182591089716235,
|
||
|
|
"learning_rate": 1.997876648369168e-05,
|
||
|
|
"loss": 0.6066,
|
||
|
|
"mean_token_accuracy": 0.8056635642424226,
|
||
|
|
"num_tokens": 23540073.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.479339599609375,
|
||
|
|
"epoch": 0.2204724409448819,
|
||
|
|
"grad_norm": 1.471676822279833,
|
||
|
|
"learning_rate": 1.9975842155434253e-05,
|
||
|
|
"loss": 0.6004,
|
||
|
|
"mean_token_accuracy": 0.806885845027864,
|
||
|
|
"num_tokens": 23983334.0,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.472015380859375,
|
||
|
|
"epoch": 0.22440944881889763,
|
||
|
|
"grad_norm": 1.7984365891799328,
|
||
|
|
"learning_rate": 1.997272947439608e-05,
|
||
|
|
"loss": 0.5887,
|
||
|
|
"mean_token_accuracy": 0.8096871245652437,
|
||
|
|
"num_tokens": 24429322.0,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4752197265625,
|
||
|
|
"epoch": 0.2283464566929134,
|
||
|
|
"grad_norm": 1.4009385017224354,
|
||
|
|
"learning_rate": 1.996942849934735e-05,
|
||
|
|
"loss": 0.6013,
|
||
|
|
"mean_token_accuracy": 0.8048065342009068,
|
||
|
|
"num_tokens": 24871384.0,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47406005859375,
|
||
|
|
"epoch": 0.23228346456692914,
|
||
|
|
"grad_norm": 1.4868806329344062,
|
||
|
|
"learning_rate": 1.9965939292613408e-05,
|
||
|
|
"loss": 0.5783,
|
||
|
|
"mean_token_accuracy": 0.8113819938153028,
|
||
|
|
"num_tokens": 25319061.0,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.474151611328125,
|
||
|
|
"epoch": 0.23622047244094488,
|
||
|
|
"grad_norm": 1.3967298894617224,
|
||
|
|
"learning_rate": 1.996226192007358e-05,
|
||
|
|
"loss": 0.58,
|
||
|
|
"mean_token_accuracy": 0.8119395393878222,
|
||
|
|
"num_tokens": 25766012.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4749755859375,
|
||
|
|
"epoch": 0.24015748031496062,
|
||
|
|
"grad_norm": 1.4262112118632393,
|
||
|
|
"learning_rate": 1.9958396451159937e-05,
|
||
|
|
"loss": 0.5713,
|
||
|
|
"mean_token_accuracy": 0.8120222119614482,
|
||
|
|
"num_tokens": 26170408.0,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.48187255859375,
|
||
|
|
"epoch": 0.2440944881889764,
|
||
|
|
"grad_norm": 1.505498450930936,
|
||
|
|
"learning_rate": 1.995434295885598e-05,
|
||
|
|
"loss": 0.5727,
|
||
|
|
"mean_token_accuracy": 0.8128972761332989,
|
||
|
|
"num_tokens": 26585896.0,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47503662109375,
|
||
|
|
"epoch": 0.24803149606299213,
|
||
|
|
"grad_norm": 1.457370555253117,
|
||
|
|
"learning_rate": 1.995010151969524e-05,
|
||
|
|
"loss": 0.5814,
|
||
|
|
"mean_token_accuracy": 0.8106799507513642,
|
||
|
|
"num_tokens": 27022289.0,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.481109619140625,
|
||
|
|
"epoch": 0.25196850393700787,
|
||
|
|
"grad_norm": 1.3774207441827904,
|
||
|
|
"learning_rate": 1.9945672213759872e-05,
|
||
|
|
"loss": 0.5924,
|
||
|
|
"mean_token_accuracy": 0.8066158397123218,
|
||
|
|
"num_tokens": 27468846.0,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47772216796875,
|
||
|
|
"epoch": 0.2559055118110236,
|
||
|
|
"grad_norm": 1.6141808039785952,
|
||
|
|
"learning_rate": 1.9941055124679108e-05,
|
||
|
|
"loss": 0.5868,
|
||
|
|
"mean_token_accuracy": 0.8115468313917518,
|
||
|
|
"num_tokens": 27905936.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.481658935546875,
|
||
|
|
"epoch": 0.25984251968503935,
|
||
|
|
"grad_norm": 1.4300524182611083,
|
||
|
|
"learning_rate": 1.993625033962771e-05,
|
||
|
|
"loss": 0.5921,
|
||
|
|
"mean_token_accuracy": 0.8093285923823714,
|
||
|
|
"num_tokens": 28353036.0,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4754638671875,
|
||
|
|
"epoch": 0.2637795275590551,
|
||
|
|
"grad_norm": 1.4415181715635927,
|
||
|
|
"learning_rate": 1.993125794932429e-05,
|
||
|
|
"loss": 0.5804,
|
||
|
|
"mean_token_accuracy": 0.8094886504113674,
|
||
|
|
"num_tokens": 28792395.0,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4808349609375,
|
||
|
|
"epoch": 0.2677165354330709,
|
||
|
|
"grad_norm": 1.5030670439825435,
|
||
|
|
"learning_rate": 1.9926078048029623e-05,
|
||
|
|
"loss": 0.5889,
|
||
|
|
"mean_token_accuracy": 0.8070614328607917,
|
||
|
|
"num_tokens": 29202926.0,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.468170166015625,
|
||
|
|
"epoch": 0.27165354330708663,
|
||
|
|
"grad_norm": 1.463016395641609,
|
||
|
|
"learning_rate": 1.992071073354486e-05,
|
||
|
|
"loss": 0.5804,
|
||
|
|
"mean_token_accuracy": 0.8089981079101562,
|
||
|
|
"num_tokens": 29633215.0,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.472808837890625,
|
||
|
|
"epoch": 0.2755905511811024,
|
||
|
|
"grad_norm": 1.2691410832084924,
|
||
|
|
"learning_rate": 1.9915156107209673e-05,
|
||
|
|
"loss": 0.5654,
|
||
|
|
"mean_token_accuracy": 0.8132820166647434,
|
||
|
|
"num_tokens": 30053469.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4683837890625,
|
||
|
|
"epoch": 0.2795275590551181,
|
||
|
|
"grad_norm": 1.35929946128328,
|
||
|
|
"learning_rate": 1.9909414273900353e-05,
|
||
|
|
"loss": 0.5827,
|
||
|
|
"mean_token_accuracy": 0.8104306925088167,
|
||
|
|
"num_tokens": 30508051.0,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.469024658203125,
|
||
|
|
"epoch": 0.28346456692913385,
|
||
|
|
"grad_norm": 1.314977800443806,
|
||
|
|
"learning_rate": 1.9903485342027827e-05,
|
||
|
|
"loss": 0.5563,
|
||
|
|
"mean_token_accuracy": 0.8167074229568243,
|
||
|
|
"num_tokens": 30946214.0,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.481658935546875,
|
||
|
|
"epoch": 0.2874015748031496,
|
||
|
|
"grad_norm": 1.2406296758259667,
|
||
|
|
"learning_rate": 1.98973694235356e-05,
|
||
|
|
"loss": 0.5642,
|
||
|
|
"mean_token_accuracy": 0.8149285055696964,
|
||
|
|
"num_tokens": 31363171.0,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.467681884765625,
|
||
|
|
"epoch": 0.29133858267716534,
|
||
|
|
"grad_norm": 1.2216342233110984,
|
||
|
|
"learning_rate": 1.9891066633897666e-05,
|
||
|
|
"loss": 0.555,
|
||
|
|
"mean_token_accuracy": 0.8174580186605453,
|
||
|
|
"num_tokens": 31771585.0,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4698486328125,
|
||
|
|
"epoch": 0.2952755905511811,
|
||
|
|
"grad_norm": 1.2181141707893957,
|
||
|
|
"learning_rate": 1.9884577092116296e-05,
|
||
|
|
"loss": 0.5565,
|
||
|
|
"mean_token_accuracy": 0.8176529305055737,
|
||
|
|
"num_tokens": 32178646.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.459503173828125,
|
||
|
|
"epoch": 0.2992125984251969,
|
||
|
|
"grad_norm": 1.3696165596387524,
|
||
|
|
"learning_rate": 1.9877900920719825e-05,
|
||
|
|
"loss": 0.5627,
|
||
|
|
"mean_token_accuracy": 0.813455811701715,
|
||
|
|
"num_tokens": 32609106.0,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.468231201171875,
|
||
|
|
"epoch": 0.3031496062992126,
|
||
|
|
"grad_norm": 1.2372308799624376,
|
||
|
|
"learning_rate": 1.9871038245760305e-05,
|
||
|
|
"loss": 0.5708,
|
||
|
|
"mean_token_accuracy": 0.8143888600170612,
|
||
|
|
"num_tokens": 33037253.0,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47723388671875,
|
||
|
|
"epoch": 0.30708661417322836,
|
||
|
|
"grad_norm": 1.3187950742682812,
|
||
|
|
"learning_rate": 1.9863989196811153e-05,
|
||
|
|
"loss": 0.5671,
|
||
|
|
"mean_token_accuracy": 0.8115543182939291,
|
||
|
|
"num_tokens": 33447162.0,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.463226318359375,
|
||
|
|
"epoch": 0.3110236220472441,
|
||
|
|
"grad_norm": 1.2233802262332847,
|
||
|
|
"learning_rate": 1.9856753906964686e-05,
|
||
|
|
"loss": 0.5571,
|
||
|
|
"mean_token_accuracy": 0.8157372018322349,
|
||
|
|
"num_tokens": 33878990.0,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.469940185546875,
|
||
|
|
"epoch": 0.31496062992125984,
|
||
|
|
"grad_norm": 1.185901461351517,
|
||
|
|
"learning_rate": 1.9849332512829624e-05,
|
||
|
|
"loss": 0.5568,
|
||
|
|
"mean_token_accuracy": 0.8197338776662946,
|
||
|
|
"num_tokens": 34304647.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.465545654296875,
|
||
|
|
"epoch": 0.3188976377952756,
|
||
|
|
"grad_norm": 1.4246612321853744,
|
||
|
|
"learning_rate": 1.9841725154528485e-05,
|
||
|
|
"loss": 0.5829,
|
||
|
|
"mean_token_accuracy": 0.8095870474353433,
|
||
|
|
"num_tokens": 34750816.0,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4696044921875,
|
||
|
|
"epoch": 0.3228346456692913,
|
||
|
|
"grad_norm": 1.2185392732273213,
|
||
|
|
"learning_rate": 1.983393197569497e-05,
|
||
|
|
"loss": 0.5585,
|
||
|
|
"mean_token_accuracy": 0.8164394591003656,
|
||
|
|
"num_tokens": 35180424.0,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4609375,
|
||
|
|
"epoch": 0.32677165354330706,
|
||
|
|
"grad_norm": 1.2799767664999213,
|
||
|
|
"learning_rate": 1.9825953123471235e-05,
|
||
|
|
"loss": 0.5514,
|
||
|
|
"mean_token_accuracy": 0.8194698123261333,
|
||
|
|
"num_tokens": 35639723.0,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.472259521484375,
|
||
|
|
"epoch": 0.33070866141732286,
|
||
|
|
"grad_norm": 1.2501368772967398,
|
||
|
|
"learning_rate": 1.981778874850511e-05,
|
||
|
|
"loss": 0.5625,
|
||
|
|
"mean_token_accuracy": 0.8149236952885985,
|
||
|
|
"num_tokens": 36075331.0,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.463348388671875,
|
||
|
|
"epoch": 0.3346456692913386,
|
||
|
|
"grad_norm": 1.3053960236087885,
|
||
|
|
"learning_rate": 1.980943900494727e-05,
|
||
|
|
"loss": 0.5476,
|
||
|
|
"mean_token_accuracy": 0.8175922827795148,
|
||
|
|
"num_tokens": 36513527.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4549560546875,
|
||
|
|
"epoch": 0.33858267716535434,
|
||
|
|
"grad_norm": 1.1850931627983934,
|
||
|
|
"learning_rate": 1.9800904050448296e-05,
|
||
|
|
"loss": 0.5608,
|
||
|
|
"mean_token_accuracy": 0.8153009815141559,
|
||
|
|
"num_tokens": 36982726.0,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.457763671875,
|
||
|
|
"epoch": 0.3425196850393701,
|
||
|
|
"grad_norm": 1.3097335637467127,
|
||
|
|
"learning_rate": 1.9792184046155733e-05,
|
||
|
|
"loss": 0.5376,
|
||
|
|
"mean_token_accuracy": 0.8229149403050542,
|
||
|
|
"num_tokens": 37406085.0,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46282958984375,
|
||
|
|
"epoch": 0.3464566929133858,
|
||
|
|
"grad_norm": 1.3701986539966717,
|
||
|
|
"learning_rate": 1.9783279156711022e-05,
|
||
|
|
"loss": 0.5721,
|
||
|
|
"mean_token_accuracy": 0.8138612108305097,
|
||
|
|
"num_tokens": 37849707.0,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.459503173828125,
|
||
|
|
"epoch": 0.35039370078740156,
|
||
|
|
"grad_norm": 1.324893149036643,
|
||
|
|
"learning_rate": 1.9774189550246407e-05,
|
||
|
|
"loss": 0.5421,
|
||
|
|
"mean_token_accuracy": 0.8197554592043161,
|
||
|
|
"num_tokens": 38283659.0,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46405029296875,
|
||
|
|
"epoch": 0.3543307086614173,
|
||
|
|
"grad_norm": 1.2919563457717147,
|
||
|
|
"learning_rate": 1.976491539838175e-05,
|
||
|
|
"loss": 0.5467,
|
||
|
|
"mean_token_accuracy": 0.816823348402977,
|
||
|
|
"num_tokens": 38709966.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.462371826171875,
|
||
|
|
"epoch": 0.35826771653543305,
|
||
|
|
"grad_norm": 1.2797648374269206,
|
||
|
|
"learning_rate": 1.975545687622129e-05,
|
||
|
|
"loss": 0.5444,
|
||
|
|
"mean_token_accuracy": 0.8186370227485895,
|
||
|
|
"num_tokens": 39150013.0,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45489501953125,
|
||
|
|
"epoch": 0.36220472440944884,
|
||
|
|
"grad_norm": 1.4022337956026676,
|
||
|
|
"learning_rate": 1.974581416235035e-05,
|
||
|
|
"loss": 0.5517,
|
||
|
|
"mean_token_accuracy": 0.8169004768133163,
|
||
|
|
"num_tokens": 39576553.0,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45465087890625,
|
||
|
|
"epoch": 0.3661417322834646,
|
||
|
|
"grad_norm": 1.297870829963053,
|
||
|
|
"learning_rate": 1.9735987438831947e-05,
|
||
|
|
"loss": 0.5429,
|
||
|
|
"mean_token_accuracy": 0.821047849021852,
|
||
|
|
"num_tokens": 40010713.0,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.450592041015625,
|
||
|
|
"epoch": 0.3700787401574803,
|
||
|
|
"grad_norm": 1.5793780108337674,
|
||
|
|
"learning_rate": 1.972597689120338e-05,
|
||
|
|
"loss": 0.5452,
|
||
|
|
"mean_token_accuracy": 0.8186437683179975,
|
||
|
|
"num_tokens": 40443614.0,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44610595703125,
|
||
|
|
"epoch": 0.37401574803149606,
|
||
|
|
"grad_norm": 1.2564999884723695,
|
||
|
|
"learning_rate": 1.9715782708472685e-05,
|
||
|
|
"loss": 0.5562,
|
||
|
|
"mean_token_accuracy": 0.8183111995458603,
|
||
|
|
"num_tokens": 40904703.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453094482421875,
|
||
|
|
"epoch": 0.3779527559055118,
|
||
|
|
"grad_norm": 1.2231168223051487,
|
||
|
|
"learning_rate": 1.9705405083115118e-05,
|
||
|
|
"loss": 0.5411,
|
||
|
|
"mean_token_accuracy": 0.8229661779478192,
|
||
|
|
"num_tokens": 41340079.0,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.467529296875,
|
||
|
|
"epoch": 0.38188976377952755,
|
||
|
|
"grad_norm": 1.1470871497193091,
|
||
|
|
"learning_rate": 1.9694844211069477e-05,
|
||
|
|
"loss": 0.5267,
|
||
|
|
"mean_token_accuracy": 0.8263011984527111,
|
||
|
|
"num_tokens": 41744766.0,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45391845703125,
|
||
|
|
"epoch": 0.3858267716535433,
|
||
|
|
"grad_norm": 1.219783013561985,
|
||
|
|
"learning_rate": 1.9684100291734437e-05,
|
||
|
|
"loss": 0.5404,
|
||
|
|
"mean_token_accuracy": 0.8202607650309801,
|
||
|
|
"num_tokens": 42191460.0,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.454803466796875,
|
||
|
|
"epoch": 0.38976377952755903,
|
||
|
|
"grad_norm": 1.5385898015342816,
|
||
|
|
"learning_rate": 1.9673173527964753e-05,
|
||
|
|
"loss": 0.5477,
|
||
|
|
"mean_token_accuracy": 0.8171384297311306,
|
||
|
|
"num_tokens": 42627016.0,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452423095703125,
|
||
|
|
"epoch": 0.3937007874015748,
|
||
|
|
"grad_norm": 1.061036138445293,
|
||
|
|
"learning_rate": 1.966206412606745e-05,
|
||
|
|
"loss": 0.5432,
|
||
|
|
"mean_token_accuracy": 0.820900421589613,
|
||
|
|
"num_tokens": 43064915.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45361328125,
|
||
|
|
"epoch": 0.39763779527559057,
|
||
|
|
"grad_norm": 1.169828618735002,
|
||
|
|
"learning_rate": 1.9650772295797934e-05,
|
||
|
|
"loss": 0.5481,
|
||
|
|
"mean_token_accuracy": 0.8199361320585012,
|
||
|
|
"num_tokens": 43503850.0,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449493408203125,
|
||
|
|
"epoch": 0.4015748031496063,
|
||
|
|
"grad_norm": 1.2711708641467119,
|
||
|
|
"learning_rate": 1.963929825035601e-05,
|
||
|
|
"loss": 0.5433,
|
||
|
|
"mean_token_accuracy": 0.8212072784081101,
|
||
|
|
"num_tokens": 43933193.0,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455078125,
|
||
|
|
"epoch": 0.40551181102362205,
|
||
|
|
"grad_norm": 1.2479358738950146,
|
||
|
|
"learning_rate": 1.9627642206381864e-05,
|
||
|
|
"loss": 0.543,
|
||
|
|
"mean_token_accuracy": 0.8205253165215254,
|
||
|
|
"num_tokens": 44356307.0,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46270751953125,
|
||
|
|
"epoch": 0.4094488188976378,
|
||
|
|
"grad_norm": 1.0684479566223315,
|
||
|
|
"learning_rate": 1.9615804383951992e-05,
|
||
|
|
"loss": 0.5313,
|
||
|
|
"mean_token_accuracy": 0.8274707896634936,
|
||
|
|
"num_tokens": 44764401.0,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45220947265625,
|
||
|
|
"epoch": 0.41338582677165353,
|
||
|
|
"grad_norm": 1.22972672302474,
|
||
|
|
"learning_rate": 1.9603785006575015e-05,
|
||
|
|
"loss": 0.5378,
|
||
|
|
"mean_token_accuracy": 0.822184014134109,
|
||
|
|
"num_tokens": 45203043.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46295166015625,
|
||
|
|
"epoch": 0.41732283464566927,
|
||
|
|
"grad_norm": 1.2066718330624946,
|
||
|
|
"learning_rate": 1.9591584301187477e-05,
|
||
|
|
"loss": 0.5529,
|
||
|
|
"mean_token_accuracy": 0.8195339059457183,
|
||
|
|
"num_tokens": 45618924.0,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.456146240234375,
|
||
|
|
"epoch": 0.421259842519685,
|
||
|
|
"grad_norm": 1.1758858200925488,
|
||
|
|
"learning_rate": 1.9579202498149562e-05,
|
||
|
|
"loss": 0.5466,
|
||
|
|
"mean_token_accuracy": 0.8180765705183148,
|
||
|
|
"num_tokens": 46051522.0,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4459228515625,
|
||
|
|
"epoch": 0.4251968503937008,
|
||
|
|
"grad_norm": 1.3596788756711078,
|
||
|
|
"learning_rate": 1.956663983124073e-05,
|
||
|
|
"loss": 0.5337,
|
||
|
|
"mean_token_accuracy": 0.8209798075258732,
|
||
|
|
"num_tokens": 46497049.0,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.447113037109375,
|
||
|
|
"epoch": 0.42913385826771655,
|
||
|
|
"grad_norm": 1.3368376077642434,
|
||
|
|
"learning_rate": 1.9553896537655317e-05,
|
||
|
|
"loss": 0.5195,
|
||
|
|
"mean_token_accuracy": 0.8260382236912847,
|
||
|
|
"num_tokens": 46930522.0,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4459228515625,
|
||
|
|
"epoch": 0.4330708661417323,
|
||
|
|
"grad_norm": 1.3620343015512997,
|
||
|
|
"learning_rate": 1.954097285799805e-05,
|
||
|
|
"loss": 0.5354,
|
||
|
|
"mean_token_accuracy": 0.8241122309118509,
|
||
|
|
"num_tokens": 47372874.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.468780517578125,
|
||
|
|
"epoch": 0.43700787401574803,
|
||
|
|
"grad_norm": 1.2408939744606213,
|
||
|
|
"learning_rate": 1.9527869036279507e-05,
|
||
|
|
"loss": 0.5411,
|
||
|
|
"mean_token_accuracy": 0.8190175397321582,
|
||
|
|
"num_tokens": 47781013.0,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45635986328125,
|
||
|
|
"epoch": 0.4409448818897638,
|
||
|
|
"grad_norm": 1.1631375104825024,
|
||
|
|
"learning_rate": 1.951458531991151e-05,
|
||
|
|
"loss": 0.5488,
|
||
|
|
"mean_token_accuracy": 0.8172104032710195,
|
||
|
|
"num_tokens": 48226903.0,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.459869384765625,
|
||
|
|
"epoch": 0.4448818897637795,
|
||
|
|
"grad_norm": 2.1105607272357965,
|
||
|
|
"learning_rate": 1.9501121959702444e-05,
|
||
|
|
"loss": 0.5223,
|
||
|
|
"mean_token_accuracy": 0.8250879934057593,
|
||
|
|
"num_tokens": 48666939.0,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.467315673828125,
|
||
|
|
"epoch": 0.44881889763779526,
|
||
|
|
"grad_norm": 1.2466596421057292,
|
||
|
|
"learning_rate": 1.9487479209852537e-05,
|
||
|
|
"loss": 0.5223,
|
||
|
|
"mean_token_accuracy": 0.8233406702056527,
|
||
|
|
"num_tokens": 49095261.0,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.462646484375,
|
||
|
|
"epoch": 0.452755905511811,
|
||
|
|
"grad_norm": 0.9969642868725791,
|
||
|
|
"learning_rate": 1.9473657327949055e-05,
|
||
|
|
"loss": 0.5118,
|
||
|
|
"mean_token_accuracy": 0.829724857583642,
|
||
|
|
"num_tokens": 49516172.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4603271484375,
|
||
|
|
"epoch": 0.4566929133858268,
|
||
|
|
"grad_norm": 1.1110224749457402,
|
||
|
|
"learning_rate": 1.9459656574961427e-05,
|
||
|
|
"loss": 0.5304,
|
||
|
|
"mean_token_accuracy": 0.8251465121284127,
|
||
|
|
"num_tokens": 49942159.0,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455718994140625,
|
||
|
|
"epoch": 0.46062992125984253,
|
||
|
|
"grad_norm": 1.1683751909145832,
|
||
|
|
"learning_rate": 1.9445477215236343e-05,
|
||
|
|
"loss": 0.5391,
|
||
|
|
"mean_token_accuracy": 0.8204762721434236,
|
||
|
|
"num_tokens": 50362851.0,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.459686279296875,
|
||
|
|
"epoch": 0.4645669291338583,
|
||
|
|
"grad_norm": 1.1316191678427665,
|
||
|
|
"learning_rate": 1.9431119516492725e-05,
|
||
|
|
"loss": 0.5307,
|
||
|
|
"mean_token_accuracy": 0.8236522153019905,
|
||
|
|
"num_tokens": 50797404.0,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45440673828125,
|
||
|
|
"epoch": 0.468503937007874,
|
||
|
|
"grad_norm": 1.068136681966582,
|
||
|
|
"learning_rate": 1.941658374981672e-05,
|
||
|
|
"loss": 0.5181,
|
||
|
|
"mean_token_accuracy": 0.8265996854752302,
|
||
|
|
"num_tokens": 51236009.0,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.456207275390625,
|
||
|
|
"epoch": 0.47244094488188976,
|
||
|
|
"grad_norm": 1.2446449512193436,
|
||
|
|
"learning_rate": 1.9401870189656534e-05,
|
||
|
|
"loss": 0.5218,
|
||
|
|
"mean_token_accuracy": 0.8257655389606953,
|
||
|
|
"num_tokens": 51653792.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45611572265625,
|
||
|
|
"epoch": 0.4763779527559055,
|
||
|
|
"grad_norm": 1.1566053130388678,
|
||
|
|
"learning_rate": 1.9386979113817283e-05,
|
||
|
|
"loss": 0.5246,
|
||
|
|
"mean_token_accuracy": 0.8237393777817488,
|
||
|
|
"num_tokens": 52081608.0,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46746826171875,
|
||
|
|
"epoch": 0.48031496062992124,
|
||
|
|
"grad_norm": 1.0908915988571897,
|
||
|
|
"learning_rate": 1.937191080345574e-05,
|
||
|
|
"loss": 0.5131,
|
||
|
|
"mean_token_accuracy": 0.8298475751653314,
|
||
|
|
"num_tokens": 52491118.0,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.47027587890625,
|
||
|
|
"epoch": 0.484251968503937,
|
||
|
|
"grad_norm": 1.1892400483856065,
|
||
|
|
"learning_rate": 1.9356665543075013e-05,
|
||
|
|
"loss": 0.5208,
|
||
|
|
"mean_token_accuracy": 0.8260363638401031,
|
||
|
|
"num_tokens": 52914582.0,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.458648681640625,
|
||
|
|
"epoch": 0.4881889763779528,
|
||
|
|
"grad_norm": 1.1290959732866495,
|
||
|
|
"learning_rate": 1.934124362051919e-05,
|
||
|
|
"loss": 0.5392,
|
||
|
|
"mean_token_accuracy": 0.8196229068562388,
|
||
|
|
"num_tokens": 53371925.0,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.482818603515625,
|
||
|
|
"epoch": 0.4921259842519685,
|
||
|
|
"grad_norm": 1.2157854071703211,
|
||
|
|
"learning_rate": 1.9325645326967904e-05,
|
||
|
|
"loss": 0.5451,
|
||
|
|
"mean_token_accuracy": 0.8218522099778056,
|
||
|
|
"num_tokens": 53774738.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453887939453125,
|
||
|
|
"epoch": 0.49606299212598426,
|
||
|
|
"grad_norm": 1.1985989720544075,
|
||
|
|
"learning_rate": 1.9309870956930818e-05,
|
||
|
|
"loss": 0.5239,
|
||
|
|
"mean_token_accuracy": 0.8237557969987392,
|
||
|
|
"num_tokens": 54223418.0,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4586181640625,
|
||
|
|
"epoch": 0.5,
|
||
|
|
"grad_norm": 1.104716347359038,
|
||
|
|
"learning_rate": 1.9293920808242084e-05,
|
||
|
|
"loss": 0.5216,
|
||
|
|
"mean_token_accuracy": 0.8251854004338384,
|
||
|
|
"num_tokens": 54655762.0,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4671630859375,
|
||
|
|
"epoch": 0.5039370078740157,
|
||
|
|
"grad_norm": 1.1779574568986346,
|
||
|
|
"learning_rate": 1.927779518205471e-05,
|
||
|
|
"loss": 0.5229,
|
||
|
|
"mean_token_accuracy": 0.8235677415505052,
|
||
|
|
"num_tokens": 55081121.0,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4610595703125,
|
||
|
|
"epoch": 0.5078740157480315,
|
||
|
|
"grad_norm": 1.211241476354621,
|
||
|
|
"learning_rate": 1.9261494382834866e-05,
|
||
|
|
"loss": 0.5206,
|
||
|
|
"mean_token_accuracy": 0.8260276168584824,
|
||
|
|
"num_tokens": 55501868.0,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455474853515625,
|
||
|
|
"epoch": 0.5118110236220472,
|
||
|
|
"grad_norm": 1.2106744968057903,
|
||
|
|
"learning_rate": 1.924501871835616e-05,
|
||
|
|
"loss": 0.5363,
|
||
|
|
"mean_token_accuracy": 0.8244349677115679,
|
||
|
|
"num_tokens": 55942607.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45953369140625,
|
||
|
|
"epoch": 0.515748031496063,
|
||
|
|
"grad_norm": 1.1675102322000497,
|
||
|
|
"learning_rate": 1.9228368499693805e-05,
|
||
|
|
"loss": 0.5182,
|
||
|
|
"mean_token_accuracy": 0.825249838642776,
|
||
|
|
"num_tokens": 56371760.0,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.458892822265625,
|
||
|
|
"epoch": 0.5196850393700787,
|
||
|
|
"grad_norm": 1.0672534247625016,
|
||
|
|
"learning_rate": 1.9211544041218752e-05,
|
||
|
|
"loss": 0.5127,
|
||
|
|
"mean_token_accuracy": 0.8282053135335445,
|
||
|
|
"num_tokens": 56805459.0,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.461151123046875,
|
||
|
|
"epoch": 0.5236220472440944,
|
||
|
|
"grad_norm": 1.0870642318591395,
|
||
|
|
"learning_rate": 1.9194545660591753e-05,
|
||
|
|
"loss": 0.5412,
|
||
|
|
"mean_token_accuracy": 0.8184263566508889,
|
||
|
|
"num_tokens": 57252250.0,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.454803466796875,
|
||
|
|
"epoch": 0.5275590551181102,
|
||
|
|
"grad_norm": 0.9981587768631067,
|
||
|
|
"learning_rate": 1.917737367875736e-05,
|
||
|
|
"loss": 0.5303,
|
||
|
|
"mean_token_accuracy": 0.8198808785527945,
|
||
|
|
"num_tokens": 57699541.0,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.462493896484375,
|
||
|
|
"epoch": 0.531496062992126,
|
||
|
|
"grad_norm": 1.1368326435130822,
|
||
|
|
"learning_rate": 1.916002841993789e-05,
|
||
|
|
"loss": 0.5344,
|
||
|
|
"mean_token_accuracy": 0.8233586028218269,
|
||
|
|
"num_tokens": 58126669.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46136474609375,
|
||
|
|
"epoch": 0.5354330708661418,
|
||
|
|
"grad_norm": 1.022490800089311,
|
||
|
|
"learning_rate": 1.9142510211627264e-05,
|
||
|
|
"loss": 0.5189,
|
||
|
|
"mean_token_accuracy": 0.8275930220261216,
|
||
|
|
"num_tokens": 58554779.0,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.471771240234375,
|
||
|
|
"epoch": 0.5393700787401575,
|
||
|
|
"grad_norm": 1.0787531090577647,
|
||
|
|
"learning_rate": 1.912481938458485e-05,
|
||
|
|
"loss": 0.5099,
|
||
|
|
"mean_token_accuracy": 0.8287343252450228,
|
||
|
|
"num_tokens": 58973692.0,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4613037109375,
|
||
|
|
"epoch": 0.5433070866141733,
|
||
|
|
"grad_norm": 0.9846465542742977,
|
||
|
|
"learning_rate": 1.9106956272829212e-05,
|
||
|
|
"loss": 0.5308,
|
||
|
|
"mean_token_accuracy": 0.8231041831895709,
|
||
|
|
"num_tokens": 59427304.0,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.465423583984375,
|
||
|
|
"epoch": 0.547244094488189,
|
||
|
|
"grad_norm": 1.020134766573174,
|
||
|
|
"learning_rate": 1.9088921213631803e-05,
|
||
|
|
"loss": 0.5177,
|
||
|
|
"mean_token_accuracy": 0.8275422500446439,
|
||
|
|
"num_tokens": 59859256.0,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453399658203125,
|
||
|
|
"epoch": 0.5511811023622047,
|
||
|
|
"grad_norm": 1.07451870093493,
|
||
|
|
"learning_rate": 1.9070714547510593e-05,
|
||
|
|
"loss": 0.5157,
|
||
|
|
"mean_token_accuracy": 0.8267948348075151,
|
||
|
|
"num_tokens": 60291249.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.467132568359375,
|
||
|
|
"epoch": 0.5551181102362205,
|
||
|
|
"grad_norm": 1.0197584285113064,
|
||
|
|
"learning_rate": 1.9052336618223655e-05,
|
||
|
|
"loss": 0.4942,
|
||
|
|
"mean_token_accuracy": 0.8316316213458776,
|
||
|
|
"num_tokens": 60700236.0,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455078125,
|
||
|
|
"epoch": 0.5590551181102362,
|
||
|
|
"grad_norm": 0.9730024523474553,
|
||
|
|
"learning_rate": 1.9033787772762647e-05,
|
||
|
|
"loss": 0.5184,
|
||
|
|
"mean_token_accuracy": 0.8264486761763692,
|
||
|
|
"num_tokens": 61135518.0,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.458465576171875,
|
||
|
|
"epoch": 0.562992125984252,
|
||
|
|
"grad_norm": 1.4869151975995116,
|
||
|
|
"learning_rate": 1.9015068361346284e-05,
|
||
|
|
"loss": 0.5108,
|
||
|
|
"mean_token_accuracy": 0.8289940198883414,
|
||
|
|
"num_tokens": 61551878.0,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.444732666015625,
|
||
|
|
"epoch": 0.5669291338582677,
|
||
|
|
"grad_norm": 1.066403284862127,
|
||
|
|
"learning_rate": 1.8996178737413724e-05,
|
||
|
|
"loss": 0.5103,
|
||
|
|
"mean_token_accuracy": 0.8301606168970466,
|
||
|
|
"num_tokens": 62004999.0,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.463714599609375,
|
||
|
|
"epoch": 0.5708661417322834,
|
||
|
|
"grad_norm": 0.9982577993437047,
|
||
|
|
"learning_rate": 1.8977119257617878e-05,
|
||
|
|
"loss": 0.5149,
|
||
|
|
"mean_token_accuracy": 0.8276943136006594,
|
||
|
|
"num_tokens": 62421515.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45458984375,
|
||
|
|
"epoch": 0.5748031496062992,
|
||
|
|
"grad_norm": 1.0544064260214807,
|
||
|
|
"learning_rate": 1.8957890281818693e-05,
|
||
|
|
"loss": 0.5116,
|
||
|
|
"mean_token_accuracy": 0.8283577999100089,
|
||
|
|
"num_tokens": 62842401.0,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.464019775390625,
|
||
|
|
"epoch": 0.5787401574803149,
|
||
|
|
"grad_norm": 1.0336300313219318,
|
||
|
|
"learning_rate": 1.893849217307635e-05,
|
||
|
|
"loss": 0.5168,
|
||
|
|
"mean_token_accuracy": 0.8278819629922509,
|
||
|
|
"num_tokens": 63269680.0,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4522705078125,
|
||
|
|
"epoch": 0.5826771653543307,
|
||
|
|
"grad_norm": 1.090759657848,
|
||
|
|
"learning_rate": 1.8918925297644418e-05,
|
||
|
|
"loss": 0.5079,
|
||
|
|
"mean_token_accuracy": 0.8282352862879634,
|
||
|
|
"num_tokens": 63710872.0,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455718994140625,
|
||
|
|
"epoch": 0.5866141732283464,
|
||
|
|
"grad_norm": 1.1258346244217974,
|
||
|
|
"learning_rate": 1.889919002496291e-05,
|
||
|
|
"loss": 0.516,
|
||
|
|
"mean_token_accuracy": 0.8259612387046218,
|
||
|
|
"num_tokens": 64150024.0,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46331787109375,
|
||
|
|
"epoch": 0.5905511811023622,
|
||
|
|
"grad_norm": 0.99965957473205,
|
||
|
|
"learning_rate": 1.8879286727651357e-05,
|
||
|
|
"loss": 0.5257,
|
||
|
|
"mean_token_accuracy": 0.8258456196635962,
|
||
|
|
"num_tokens": 64581389.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46087646484375,
|
||
|
|
"epoch": 0.594488188976378,
|
||
|
|
"grad_norm": 1.050563121756213,
|
||
|
|
"learning_rate": 1.8859215781501727e-05,
|
||
|
|
"loss": 0.5092,
|
||
|
|
"mean_token_accuracy": 0.8281868807971478,
|
||
|
|
"num_tokens": 65008879.0,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.46453857421875,
|
||
|
|
"epoch": 0.5984251968503937,
|
||
|
|
"grad_norm": 0.9756638749951236,
|
||
|
|
"learning_rate": 1.8838977565471343e-05,
|
||
|
|
"loss": 0.5102,
|
||
|
|
"mean_token_accuracy": 0.8313626917079091,
|
||
|
|
"num_tokens": 65418174.0,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4627685546875,
|
||
|
|
"epoch": 0.6023622047244095,
|
||
|
|
"grad_norm": 0.9933688064734119,
|
||
|
|
"learning_rate": 1.881857246167575e-05,
|
||
|
|
"loss": 0.5034,
|
||
|
|
"mean_token_accuracy": 0.8302843710407615,
|
||
|
|
"num_tokens": 65832965.0,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.461334228515625,
|
||
|
|
"epoch": 0.6062992125984252,
|
||
|
|
"grad_norm": 0.9166610309560314,
|
||
|
|
"learning_rate": 1.8798000855381472e-05,
|
||
|
|
"loss": 0.4962,
|
||
|
|
"mean_token_accuracy": 0.8299053413793445,
|
||
|
|
"num_tokens": 66238636.0,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45989990234375,
|
||
|
|
"epoch": 0.610236220472441,
|
||
|
|
"grad_norm": 1.0171788498896126,
|
||
|
|
"learning_rate": 1.8777263134998745e-05,
|
||
|
|
"loss": 0.5026,
|
||
|
|
"mean_token_accuracy": 0.8297234000638127,
|
||
|
|
"num_tokens": 66650377.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.456085205078125,
|
||
|
|
"epoch": 0.6141732283464567,
|
||
|
|
"grad_norm": 0.9317200752741837,
|
||
|
|
"learning_rate": 1.8756359692074192e-05,
|
||
|
|
"loss": 0.5201,
|
||
|
|
"mean_token_accuracy": 0.8240464190021157,
|
||
|
|
"num_tokens": 67099891.0,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.455780029296875,
|
||
|
|
"epoch": 0.6181102362204725,
|
||
|
|
"grad_norm": 1.0545205323300713,
|
||
|
|
"learning_rate": 1.873529092128343e-05,
|
||
|
|
"loss": 0.5135,
|
||
|
|
"mean_token_accuracy": 0.8289516335353255,
|
||
|
|
"num_tokens": 67526947.0,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45404052734375,
|
||
|
|
"epoch": 0.6220472440944882,
|
||
|
|
"grad_norm": 0.9464181426572896,
|
||
|
|
"learning_rate": 1.8714057220423604e-05,
|
||
|
|
"loss": 0.4911,
|
||
|
|
"mean_token_accuracy": 0.8318861154839396,
|
||
|
|
"num_tokens": 67941621.0,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45538330078125,
|
||
|
|
"epoch": 0.6259842519685039,
|
||
|
|
"grad_norm": 0.8719131150322863,
|
||
|
|
"learning_rate": 1.8692658990405887e-05,
|
||
|
|
"loss": 0.5166,
|
||
|
|
"mean_token_accuracy": 0.8262722417712212,
|
||
|
|
"num_tokens": 68373473.0,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452972412109375,
|
||
|
|
"epoch": 0.6299212598425197,
|
||
|
|
"grad_norm": 0.9697678103322591,
|
||
|
|
"learning_rate": 1.8671096635247914e-05,
|
||
|
|
"loss": 0.5089,
|
||
|
|
"mean_token_accuracy": 0.8285679388791323,
|
||
|
|
"num_tokens": 68787973.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4500732421875,
|
||
|
|
"epoch": 0.6338582677165354,
|
||
|
|
"grad_norm": 0.9035700093333056,
|
||
|
|
"learning_rate": 1.8649370562066147e-05,
|
||
|
|
"loss": 0.4953,
|
||
|
|
"mean_token_accuracy": 0.8336192965507507,
|
||
|
|
"num_tokens": 69220309.0,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.456329345703125,
|
||
|
|
"epoch": 0.6377952755905512,
|
||
|
|
"grad_norm": 0.8949309294710832,
|
||
|
|
"learning_rate": 1.8627481181068185e-05,
|
||
|
|
"loss": 0.4982,
|
||
|
|
"mean_token_accuracy": 0.8325092112645507,
|
||
|
|
"num_tokens": 69626711.0,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4501953125,
|
||
|
|
"epoch": 0.6417322834645669,
|
||
|
|
"grad_norm": 0.9662881010100235,
|
||
|
|
"learning_rate": 1.860542890554503e-05,
|
||
|
|
"loss": 0.4961,
|
||
|
|
"mean_token_accuracy": 0.8319829516112804,
|
||
|
|
"num_tokens": 70049083.0,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452392578125,
|
||
|
|
"epoch": 0.6456692913385826,
|
||
|
|
"grad_norm": 0.9112855567573753,
|
||
|
|
"learning_rate": 1.8583214151863277e-05,
|
||
|
|
"loss": 0.5062,
|
||
|
|
"mean_token_accuracy": 0.8289196388795972,
|
||
|
|
"num_tokens": 70467285.0,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44775390625,
|
||
|
|
"epoch": 0.6496062992125984,
|
||
|
|
"grad_norm": 0.9403307323488074,
|
||
|
|
"learning_rate": 1.856083733945725e-05,
|
||
|
|
"loss": 0.5192,
|
||
|
|
"mean_token_accuracy": 0.8249025819823146,
|
||
|
|
"num_tokens": 70915085.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44525146484375,
|
||
|
|
"epoch": 0.6535433070866141,
|
||
|
|
"grad_norm": 0.8985452902281895,
|
||
|
|
"learning_rate": 1.853829889082109e-05,
|
||
|
|
"loss": 0.5034,
|
||
|
|
"mean_token_accuracy": 0.830441677942872,
|
||
|
|
"num_tokens": 71339757.0,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.435821533203125,
|
||
|
|
"epoch": 0.65748031496063,
|
||
|
|
"grad_norm": 0.8822733945012894,
|
||
|
|
"learning_rate": 1.851559923150077e-05,
|
||
|
|
"loss": 0.5137,
|
||
|
|
"mean_token_accuracy": 0.8269774587824941,
|
||
|
|
"num_tokens": 71798915.0,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452911376953125,
|
||
|
|
"epoch": 0.6614173228346457,
|
||
|
|
"grad_norm": 0.8920203408684533,
|
||
|
|
"learning_rate": 1.8492738790086066e-05,
|
||
|
|
"loss": 0.509,
|
||
|
|
"mean_token_accuracy": 0.8303144676610827,
|
||
|
|
"num_tokens": 72213283.0,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4490966796875,
|
||
|
|
"epoch": 0.6653543307086615,
|
||
|
|
"grad_norm": 0.9204794974635663,
|
||
|
|
"learning_rate": 1.8469717998202464e-05,
|
||
|
|
"loss": 0.5014,
|
||
|
|
"mean_token_accuracy": 0.8291385043412447,
|
||
|
|
"num_tokens": 72640357.0,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44903564453125,
|
||
|
|
"epoch": 0.6692913385826772,
|
||
|
|
"grad_norm": 0.8803703609637004,
|
||
|
|
"learning_rate": 1.844653729050301e-05,
|
||
|
|
"loss": 0.5083,
|
||
|
|
"mean_token_accuracy": 0.8296322766691446,
|
||
|
|
"num_tokens": 73071318.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44970703125,
|
||
|
|
"epoch": 0.6732283464566929,
|
||
|
|
"grad_norm": 0.9449900174934937,
|
||
|
|
"learning_rate": 1.8423197104660094e-05,
|
||
|
|
"loss": 0.4944,
|
||
|
|
"mean_token_accuracy": 0.8333448059856892,
|
||
|
|
"num_tokens": 73496580.0,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45001220703125,
|
||
|
|
"epoch": 0.6771653543307087,
|
||
|
|
"grad_norm": 0.9485819451800577,
|
||
|
|
"learning_rate": 1.8399697881357214e-05,
|
||
|
|
"loss": 0.5093,
|
||
|
|
"mean_token_accuracy": 0.8251696135848761,
|
||
|
|
"num_tokens": 73911332.0,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4556884765625,
|
||
|
|
"epoch": 0.6811023622047244,
|
||
|
|
"grad_norm": 0.8788533882995756,
|
||
|
|
"learning_rate": 1.8376040064280616e-05,
|
||
|
|
"loss": 0.5064,
|
||
|
|
"mean_token_accuracy": 0.8283712277188897,
|
||
|
|
"num_tokens": 74343121.0,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.446075439453125,
|
||
|
|
"epoch": 0.6850393700787402,
|
||
|
|
"grad_norm": 1.1568959205226408,
|
||
|
|
"learning_rate": 1.835222410011096e-05,
|
||
|
|
"loss": 0.4985,
|
||
|
|
"mean_token_accuracy": 0.8313534203916788,
|
||
|
|
"num_tokens": 74767266.0,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452178955078125,
|
||
|
|
"epoch": 0.6889763779527559,
|
||
|
|
"grad_norm": 1.025805343277443,
|
||
|
|
"learning_rate": 1.8328250438514837e-05,
|
||
|
|
"loss": 0.5061,
|
||
|
|
"mean_token_accuracy": 0.8259405111894011,
|
||
|
|
"num_tokens": 75183236.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45050048828125,
|
||
|
|
"epoch": 0.6929133858267716,
|
||
|
|
"grad_norm": 0.9573441596688619,
|
||
|
|
"learning_rate": 1.8304119532136328e-05,
|
||
|
|
"loss": 0.5113,
|
||
|
|
"mean_token_accuracy": 0.8278427310287952,
|
||
|
|
"num_tokens": 75607777.0,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453765869140625,
|
||
|
|
"epoch": 0.6968503937007874,
|
||
|
|
"grad_norm": 1.002212658374949,
|
||
|
|
"learning_rate": 1.8279831836588427e-05,
|
||
|
|
"loss": 0.5021,
|
||
|
|
"mean_token_accuracy": 0.8309840820729733,
|
||
|
|
"num_tokens": 76033909.0,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45233154296875,
|
||
|
|
"epoch": 0.7007874015748031,
|
||
|
|
"grad_norm": 1.0753125904594512,
|
||
|
|
"learning_rate": 1.8255387810444447e-05,
|
||
|
|
"loss": 0.4992,
|
||
|
|
"mean_token_accuracy": 0.832436814904213,
|
||
|
|
"num_tokens": 76455698.0,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44952392578125,
|
||
|
|
"epoch": 0.7047244094488189,
|
||
|
|
"grad_norm": 0.8922465191801553,
|
||
|
|
"learning_rate": 1.8230787915229358e-05,
|
||
|
|
"loss": 0.505,
|
||
|
|
"mean_token_accuracy": 0.8306124126538634,
|
||
|
|
"num_tokens": 76889182.0,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441192626953125,
|
||
|
|
"epoch": 0.7086614173228346,
|
||
|
|
"grad_norm": 1.0851600035028615,
|
||
|
|
"learning_rate": 1.8206032615411092e-05,
|
||
|
|
"loss": 0.5052,
|
||
|
|
"mean_token_accuracy": 0.8298691343516111,
|
||
|
|
"num_tokens": 77329833.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44793701171875,
|
||
|
|
"epoch": 0.7125984251968503,
|
||
|
|
"grad_norm": 0.9640089148569666,
|
||
|
|
"learning_rate": 1.818112237839174e-05,
|
||
|
|
"loss": 0.4972,
|
||
|
|
"mean_token_accuracy": 0.8315172418951988,
|
||
|
|
"num_tokens": 77743805.0,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45074462890625,
|
||
|
|
"epoch": 0.7165354330708661,
|
||
|
|
"grad_norm": 0.8688883877508005,
|
||
|
|
"learning_rate": 1.8156057674498756e-05,
|
||
|
|
"loss": 0.497,
|
||
|
|
"mean_token_accuracy": 0.8296341849491,
|
||
|
|
"num_tokens": 78174355.0,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437103271484375,
|
||
|
|
"epoch": 0.7204724409448819,
|
||
|
|
"grad_norm": 0.8482457279373461,
|
||
|
|
"learning_rate": 1.8130838976976058e-05,
|
||
|
|
"loss": 0.4947,
|
||
|
|
"mean_token_accuracy": 0.8344184467568994,
|
||
|
|
"num_tokens": 78613242.0,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44915771484375,
|
||
|
|
"epoch": 0.7244094488188977,
|
||
|
|
"grad_norm": 0.8888701349814255,
|
||
|
|
"learning_rate": 1.810546676197511e-05,
|
||
|
|
"loss": 0.4879,
|
||
|
|
"mean_token_accuracy": 0.8355774069204926,
|
||
|
|
"num_tokens": 79037611.0,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449859619140625,
|
||
|
|
"epoch": 0.7283464566929134,
|
||
|
|
"grad_norm": 0.8760275198824425,
|
||
|
|
"learning_rate": 1.807994150854592e-05,
|
||
|
|
"loss": 0.5002,
|
||
|
|
"mean_token_accuracy": 0.8316125152632594,
|
||
|
|
"num_tokens": 79453628.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44781494140625,
|
||
|
|
"epoch": 0.7322834645669292,
|
||
|
|
"grad_norm": 1.2974591275029912,
|
||
|
|
"learning_rate": 1.805426369862799e-05,
|
||
|
|
"loss": 0.5035,
|
||
|
|
"mean_token_accuracy": 0.8302051173523068,
|
||
|
|
"num_tokens": 79888974.0,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4373779296875,
|
||
|
|
"epoch": 0.7362204724409449,
|
||
|
|
"grad_norm": 0.8929755000611298,
|
||
|
|
"learning_rate": 1.8028433817041237e-05,
|
||
|
|
"loss": 0.4822,
|
||
|
|
"mean_token_accuracy": 0.8354063536971807,
|
||
|
|
"num_tokens": 80321324.0,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443572998046875,
|
||
|
|
"epoch": 0.7401574803149606,
|
||
|
|
"grad_norm": 0.8329491046833114,
|
||
|
|
"learning_rate": 1.8002452351476817e-05,
|
||
|
|
"loss": 0.5047,
|
||
|
|
"mean_token_accuracy": 0.8291633054614067,
|
||
|
|
"num_tokens": 80766297.0,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.445404052734375,
|
||
|
|
"epoch": 0.7440944881889764,
|
||
|
|
"grad_norm": 0.8665441369913496,
|
||
|
|
"learning_rate": 1.7976319792487933e-05,
|
||
|
|
"loss": 0.4915,
|
||
|
|
"mean_token_accuracy": 0.8326784670352936,
|
||
|
|
"num_tokens": 81192416.0,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442962646484375,
|
||
|
|
"epoch": 0.7480314960629921,
|
||
|
|
"grad_norm": 0.8336203122290462,
|
||
|
|
"learning_rate": 1.7950036633480557e-05,
|
||
|
|
"loss": 0.4895,
|
||
|
|
"mean_token_accuracy": 0.8337859380990267,
|
||
|
|
"num_tokens": 81633131.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.438995361328125,
|
||
|
|
"epoch": 0.7519685039370079,
|
||
|
|
"grad_norm": 0.8560354139325418,
|
||
|
|
"learning_rate": 1.7923603370704136e-05,
|
||
|
|
"loss": 0.4849,
|
||
|
|
"mean_token_accuracy": 0.8339350931346416,
|
||
|
|
"num_tokens": 82073657.0,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43817138671875,
|
||
|
|
"epoch": 0.7559055118110236,
|
||
|
|
"grad_norm": 0.8646497974459243,
|
||
|
|
"learning_rate": 1.7897020503242192e-05,
|
||
|
|
"loss": 0.4829,
|
||
|
|
"mean_token_accuracy": 0.8361313687637448,
|
||
|
|
"num_tokens": 82495559.0,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44427490234375,
|
||
|
|
"epoch": 0.7598425196850394,
|
||
|
|
"grad_norm": 0.8131289484833817,
|
||
|
|
"learning_rate": 1.787028853300294e-05,
|
||
|
|
"loss": 0.4807,
|
||
|
|
"mean_token_accuracy": 0.8349549919366837,
|
||
|
|
"num_tokens": 82909007.0,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4383544921875,
|
||
|
|
"epoch": 0.7637795275590551,
|
||
|
|
"grad_norm": 0.7972139228607141,
|
||
|
|
"learning_rate": 1.7843407964709773e-05,
|
||
|
|
"loss": 0.482,
|
||
|
|
"mean_token_accuracy": 0.8354136543348432,
|
||
|
|
"num_tokens": 83334137.0,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.438873291015625,
|
||
|
|
"epoch": 0.7677165354330708,
|
||
|
|
"grad_norm": 0.882340235786015,
|
||
|
|
"learning_rate": 1.7816379305891743e-05,
|
||
|
|
"loss": 0.4882,
|
||
|
|
"mean_token_accuracy": 0.8349952660501003,
|
||
|
|
"num_tokens": 83761623.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4444580078125,
|
||
|
|
"epoch": 0.7716535433070866,
|
||
|
|
"grad_norm": 0.8653917946283164,
|
||
|
|
"learning_rate": 1.7789203066874e-05,
|
||
|
|
"loss": 0.4914,
|
||
|
|
"mean_token_accuracy": 0.8350548483431339,
|
||
|
|
"num_tokens": 84184837.0,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44403076171875,
|
||
|
|
"epoch": 0.7755905511811023,
|
||
|
|
"grad_norm": 0.9248212539704527,
|
||
|
|
"learning_rate": 1.7761879760768123e-05,
|
||
|
|
"loss": 0.4928,
|
||
|
|
"mean_token_accuracy": 0.8332647895440459,
|
||
|
|
"num_tokens": 84627232.0,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43585205078125,
|
||
|
|
"epoch": 0.7795275590551181,
|
||
|
|
"grad_norm": 0.8874785042910702,
|
||
|
|
"learning_rate": 1.7734409903462454e-05,
|
||
|
|
"loss": 0.4851,
|
||
|
|
"mean_token_accuracy": 0.8350456738844514,
|
||
|
|
"num_tokens": 85067016.0,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4476318359375,
|
||
|
|
"epoch": 0.7834645669291339,
|
||
|
|
"grad_norm": 0.8774288891360084,
|
||
|
|
"learning_rate": 1.7706794013612367e-05,
|
||
|
|
"loss": 0.497,
|
||
|
|
"mean_token_accuracy": 0.8308952897787094,
|
||
|
|
"num_tokens": 85474661.0,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440887451171875,
|
||
|
|
"epoch": 0.7874015748031497,
|
||
|
|
"grad_norm": 0.950578748327104,
|
||
|
|
"learning_rate": 1.7679032612630432e-05,
|
||
|
|
"loss": 0.4872,
|
||
|
|
"mean_token_accuracy": 0.8345352541655302,
|
||
|
|
"num_tokens": 85907009.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441009521484375,
|
||
|
|
"epoch": 0.7913385826771654,
|
||
|
|
"grad_norm": 0.8638996030741382,
|
||
|
|
"learning_rate": 1.7651126224676616e-05,
|
||
|
|
"loss": 0.4971,
|
||
|
|
"mean_token_accuracy": 0.8309849118813872,
|
||
|
|
"num_tokens": 86334007.0,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4449462890625,
|
||
|
|
"epoch": 0.7952755905511811,
|
||
|
|
"grad_norm": 0.9130367054606549,
|
||
|
|
"learning_rate": 1.7623075376648374e-05,
|
||
|
|
"loss": 0.4906,
|
||
|
|
"mean_token_accuracy": 0.8348276494070888,
|
||
|
|
"num_tokens": 86756048.0,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4334716796875,
|
||
|
|
"epoch": 0.7992125984251969,
|
||
|
|
"grad_norm": 0.8490365529225518,
|
||
|
|
"learning_rate": 1.7594880598170688e-05,
|
||
|
|
"loss": 0.4932,
|
||
|
|
"mean_token_accuracy": 0.8335457816720009,
|
||
|
|
"num_tokens": 87189461.0,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440887451171875,
|
||
|
|
"epoch": 0.8031496062992126,
|
||
|
|
"grad_norm": 0.8512282062983372,
|
||
|
|
"learning_rate": 1.756654242158607e-05,
|
||
|
|
"loss": 0.4802,
|
||
|
|
"mean_token_accuracy": 0.8350167060270905,
|
||
|
|
"num_tokens": 87612667.0,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.446807861328125,
|
||
|
|
"epoch": 0.8070866141732284,
|
||
|
|
"grad_norm": 0.8346172013248041,
|
||
|
|
"learning_rate": 1.7538061381944524e-05,
|
||
|
|
"loss": 0.4828,
|
||
|
|
"mean_token_accuracy": 0.8335348330438137,
|
||
|
|
"num_tokens": 88032722.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.436798095703125,
|
||
|
|
"epoch": 0.8110236220472441,
|
||
|
|
"grad_norm": 0.9406039912781453,
|
||
|
|
"learning_rate": 1.7509438016993435e-05,
|
||
|
|
"loss": 0.5043,
|
||
|
|
"mean_token_accuracy": 0.828425613231957,
|
||
|
|
"num_tokens": 88455185.0,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443450927734375,
|
||
|
|
"epoch": 0.8149606299212598,
|
||
|
|
"grad_norm": 0.8878870161898881,
|
||
|
|
"learning_rate": 1.748067286716741e-05,
|
||
|
|
"loss": 0.5053,
|
||
|
|
"mean_token_accuracy": 0.8270987952128053,
|
||
|
|
"num_tokens": 88887403.0,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43701171875,
|
||
|
|
"epoch": 0.8188976377952756,
|
||
|
|
"grad_norm": 0.833846984688481,
|
||
|
|
"learning_rate": 1.745176647557809e-05,
|
||
|
|
"loss": 0.4806,
|
||
|
|
"mean_token_accuracy": 0.8361224737018347,
|
||
|
|
"num_tokens": 89321230.0,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.434295654296875,
|
||
|
|
"epoch": 0.8228346456692913,
|
||
|
|
"grad_norm": 0.8800807090551229,
|
||
|
|
"learning_rate": 1.7422719388003882e-05,
|
||
|
|
"loss": 0.4851,
|
||
|
|
"mean_token_accuracy": 0.834100566804409,
|
||
|
|
"num_tokens": 89752168.0,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4471435546875,
|
||
|
|
"epoch": 0.8267716535433071,
|
||
|
|
"grad_norm": 0.8536884570400775,
|
||
|
|
"learning_rate": 1.739353215287965e-05,
|
||
|
|
"loss": 0.5013,
|
||
|
|
"mean_token_accuracy": 0.8307216819375753,
|
||
|
|
"num_tokens": 90185259.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43634033203125,
|
||
|
|
"epoch": 0.8307086614173228,
|
||
|
|
"grad_norm": 0.838204501619093,
|
||
|
|
"learning_rate": 1.7364205321286393e-05,
|
||
|
|
"loss": 0.4741,
|
||
|
|
"mean_token_accuracy": 0.8391950000077486,
|
||
|
|
"num_tokens": 90618555.0,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.438018798828125,
|
||
|
|
"epoch": 0.8346456692913385,
|
||
|
|
"grad_norm": 0.8339181556557149,
|
||
|
|
"learning_rate": 1.7334739446940785e-05,
|
||
|
|
"loss": 0.487,
|
||
|
|
"mean_token_accuracy": 0.8343816567212343,
|
||
|
|
"num_tokens": 91070952.0,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.444671630859375,
|
||
|
|
"epoch": 0.8385826771653543,
|
||
|
|
"grad_norm": 0.9454821175064054,
|
||
|
|
"learning_rate": 1.730513508618477e-05,
|
||
|
|
"loss": 0.4879,
|
||
|
|
"mean_token_accuracy": 0.8355796793475747,
|
||
|
|
"num_tokens": 91492780.0,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443389892578125,
|
||
|
|
"epoch": 0.84251968503937,
|
||
|
|
"grad_norm": 0.9537363792825977,
|
||
|
|
"learning_rate": 1.7275392797975034e-05,
|
||
|
|
"loss": 0.4788,
|
||
|
|
"mean_token_accuracy": 0.8368481360375881,
|
||
|
|
"num_tokens": 91928189.0,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.439117431640625,
|
||
|
|
"epoch": 0.8464566929133859,
|
||
|
|
"grad_norm": 0.8170845597708071,
|
||
|
|
"learning_rate": 1.7245513143872458e-05,
|
||
|
|
"loss": 0.4796,
|
||
|
|
"mean_token_accuracy": 0.8352800803259015,
|
||
|
|
"num_tokens": 92380202.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.438140869140625,
|
||
|
|
"epoch": 0.8503937007874016,
|
||
|
|
"grad_norm": 0.8684828468772561,
|
||
|
|
"learning_rate": 1.7215496688031504e-05,
|
||
|
|
"loss": 0.4824,
|
||
|
|
"mean_token_accuracy": 0.8344074506312609,
|
||
|
|
"num_tokens": 92821942.0,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4383544921875,
|
||
|
|
"epoch": 0.8543307086614174,
|
||
|
|
"grad_norm": 0.9016894253389791,
|
||
|
|
"learning_rate": 1.718534399718959e-05,
|
||
|
|
"loss": 0.4843,
|
||
|
|
"mean_token_accuracy": 0.8332484466955066,
|
||
|
|
"num_tokens": 93256559.0,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448089599609375,
|
||
|
|
"epoch": 0.8582677165354331,
|
||
|
|
"grad_norm": 0.7926466666351812,
|
||
|
|
"learning_rate": 1.7155055640656353e-05,
|
||
|
|
"loss": 0.4779,
|
||
|
|
"mean_token_accuracy": 0.8372692186385393,
|
||
|
|
"num_tokens": 93678457.0,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453216552734375,
|
||
|
|
"epoch": 0.8622047244094488,
|
||
|
|
"grad_norm": 0.7983330308845654,
|
||
|
|
"learning_rate": 1.7124632190302936e-05,
|
||
|
|
"loss": 0.4632,
|
||
|
|
"mean_token_accuracy": 0.8423045501112938,
|
||
|
|
"num_tokens": 94088744.0,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.445556640625,
|
||
|
|
"epoch": 0.8661417322834646,
|
||
|
|
"grad_norm": 0.9230403722775413,
|
||
|
|
"learning_rate": 1.709407422055116e-05,
|
||
|
|
"loss": 0.4675,
|
||
|
|
"mean_token_accuracy": 0.8398523181676865,
|
||
|
|
"num_tokens": 94509523.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449920654296875,
|
||
|
|
"epoch": 0.8700787401574803,
|
||
|
|
"grad_norm": 0.9307744641259067,
|
||
|
|
"learning_rate": 1.70633823083627e-05,
|
||
|
|
"loss": 0.4788,
|
||
|
|
"mean_token_accuracy": 0.8371898429468274,
|
||
|
|
"num_tokens": 94939487.0,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45404052734375,
|
||
|
|
"epoch": 0.8740157480314961,
|
||
|
|
"grad_norm": 0.8081498704504936,
|
||
|
|
"learning_rate": 1.7032557033228184e-05,
|
||
|
|
"loss": 0.4831,
|
||
|
|
"mean_token_accuracy": 0.8366042831912637,
|
||
|
|
"num_tokens": 95362681.0,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441864013671875,
|
||
|
|
"epoch": 0.8779527559055118,
|
||
|
|
"grad_norm": 0.7609168209994998,
|
||
|
|
"learning_rate": 1.700159897715624e-05,
|
||
|
|
"loss": 0.4823,
|
||
|
|
"mean_token_accuracy": 0.836696463637054,
|
||
|
|
"num_tokens": 95815007.0,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452392578125,
|
||
|
|
"epoch": 0.8818897637795275,
|
||
|
|
"grad_norm": 0.8691379438133096,
|
||
|
|
"learning_rate": 1.6970508724662536e-05,
|
||
|
|
"loss": 0.4847,
|
||
|
|
"mean_token_accuracy": 0.8337752502411604,
|
||
|
|
"num_tokens": 96216692.0,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453155517578125,
|
||
|
|
"epoch": 0.8858267716535433,
|
||
|
|
"grad_norm": 0.9417537213558551,
|
||
|
|
"learning_rate": 1.693928686275871e-05,
|
||
|
|
"loss": 0.4959,
|
||
|
|
"mean_token_accuracy": 0.8307985952124,
|
||
|
|
"num_tokens": 96638434.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.45635986328125,
|
||
|
|
"epoch": 0.889763779527559,
|
||
|
|
"grad_norm": 0.8178474225248942,
|
||
|
|
"learning_rate": 1.6907933980941312e-05,
|
||
|
|
"loss": 0.4803,
|
||
|
|
"mean_token_accuracy": 0.8372792527079582,
|
||
|
|
"num_tokens": 97062650.0,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448638916015625,
|
||
|
|
"epoch": 0.8937007874015748,
|
||
|
|
"grad_norm": 0.8660818057993216,
|
||
|
|
"learning_rate": 1.6876450671180667e-05,
|
||
|
|
"loss": 0.4875,
|
||
|
|
"mean_token_accuracy": 0.8336331462487578,
|
||
|
|
"num_tokens": 97490818.0,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.452239990234375,
|
||
|
|
"epoch": 0.8976377952755905,
|
||
|
|
"grad_norm": 0.8601663155835422,
|
||
|
|
"learning_rate": 1.6844837527909682e-05,
|
||
|
|
"loss": 0.4819,
|
||
|
|
"mean_token_accuracy": 0.8355707786977291,
|
||
|
|
"num_tokens": 97912123.0,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448455810546875,
|
||
|
|
"epoch": 0.9015748031496063,
|
||
|
|
"grad_norm": 0.8792189335920719,
|
||
|
|
"learning_rate": 1.681309514801265e-05,
|
||
|
|
"loss": 0.4795,
|
||
|
|
"mean_token_accuracy": 0.8368852250277996,
|
||
|
|
"num_tokens": 98346951.0,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.446990966796875,
|
||
|
|
"epoch": 0.905511811023622,
|
||
|
|
"grad_norm": 0.8589439183505942,
|
||
|
|
"learning_rate": 1.6781224130813966e-05,
|
||
|
|
"loss": 0.4751,
|
||
|
|
"mean_token_accuracy": 0.8384532378986478,
|
||
|
|
"num_tokens": 98782373.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44207763671875,
|
||
|
|
"epoch": 0.9094488188976378,
|
||
|
|
"grad_norm": 0.9131583177729585,
|
||
|
|
"learning_rate": 1.6749225078066796e-05,
|
||
|
|
"loss": 0.4732,
|
||
|
|
"mean_token_accuracy": 0.8387185446918011,
|
||
|
|
"num_tokens": 99222036.0,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442413330078125,
|
||
|
|
"epoch": 0.9133858267716536,
|
||
|
|
"grad_norm": 0.8600516435524386,
|
||
|
|
"learning_rate": 1.6717098593941753e-05,
|
||
|
|
"loss": 0.4793,
|
||
|
|
"mean_token_accuracy": 0.8373282151296735,
|
||
|
|
"num_tokens": 99650218.0,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440216064453125,
|
||
|
|
"epoch": 0.9173228346456693,
|
||
|
|
"grad_norm": 0.866595517834408,
|
||
|
|
"learning_rate": 1.6684845285015453e-05,
|
||
|
|
"loss": 0.4706,
|
||
|
|
"mean_token_accuracy": 0.8394803171977401,
|
||
|
|
"num_tokens": 100066418.0,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4385986328125,
|
||
|
|
"epoch": 0.9212598425196851,
|
||
|
|
"grad_norm": 0.8506430586031988,
|
||
|
|
"learning_rate": 1.665246576025908e-05,
|
||
|
|
"loss": 0.4887,
|
||
|
|
"mean_token_accuracy": 0.8329129619523883,
|
||
|
|
"num_tokens": 100509512.0,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.444915771484375,
|
||
|
|
"epoch": 0.9251968503937008,
|
||
|
|
"grad_norm": 0.8365324292407998,
|
||
|
|
"learning_rate": 1.661996063102689e-05,
|
||
|
|
"loss": 0.4746,
|
||
|
|
"mean_token_accuracy": 0.8389232521876693,
|
||
|
|
"num_tokens": 100925854.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4608154296875,
|
||
|
|
"epoch": 0.9291338582677166,
|
||
|
|
"grad_norm": 0.9178339434645408,
|
||
|
|
"learning_rate": 1.658733051104466e-05,
|
||
|
|
"loss": 0.4845,
|
||
|
|
"mean_token_accuracy": 0.8356641856953502,
|
||
|
|
"num_tokens": 101324784.0,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.444549560546875,
|
||
|
|
"epoch": 0.9330708661417323,
|
||
|
|
"grad_norm": 0.8526785526614675,
|
||
|
|
"learning_rate": 1.65545760163981e-05,
|
||
|
|
"loss": 0.4686,
|
||
|
|
"mean_token_accuracy": 0.8390642059966922,
|
||
|
|
"num_tokens": 101737781.0,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442047119140625,
|
||
|
|
"epoch": 0.937007874015748,
|
||
|
|
"grad_norm": 0.8633913144106983,
|
||
|
|
"learning_rate": 1.6521697765521232e-05,
|
||
|
|
"loss": 0.4724,
|
||
|
|
"mean_token_accuracy": 0.8398132715374231,
|
||
|
|
"num_tokens": 102163549.0,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440887451171875,
|
||
|
|
"epoch": 0.9409448818897638,
|
||
|
|
"grad_norm": 0.9052461846043562,
|
||
|
|
"learning_rate": 1.64886963791847e-05,
|
||
|
|
"loss": 0.4707,
|
||
|
|
"mean_token_accuracy": 0.8368164198473096,
|
||
|
|
"num_tokens": 102610992.0,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4443359375,
|
||
|
|
"epoch": 0.9448818897637795,
|
||
|
|
"grad_norm": 0.7465118786788207,
|
||
|
|
"learning_rate": 1.645557248048406e-05,
|
||
|
|
"loss": 0.4678,
|
||
|
|
"mean_token_accuracy": 0.8387394333258271,
|
||
|
|
"num_tokens": 103035316.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44122314453125,
|
||
|
|
"epoch": 0.9488188976377953,
|
||
|
|
"grad_norm": 0.8866699103487138,
|
||
|
|
"learning_rate": 1.642232669482801e-05,
|
||
|
|
"loss": 0.4675,
|
||
|
|
"mean_token_accuracy": 0.8410151610150933,
|
||
|
|
"num_tokens": 103473084.0,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43780517578125,
|
||
|
|
"epoch": 0.952755905511811,
|
||
|
|
"grad_norm": 0.7937264780981268,
|
||
|
|
"learning_rate": 1.6388959649926567e-05,
|
||
|
|
"loss": 0.4732,
|
||
|
|
"mean_token_accuracy": 0.8379040919244289,
|
||
|
|
"num_tokens": 103902168.0,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44012451171875,
|
||
|
|
"epoch": 0.9566929133858267,
|
||
|
|
"grad_norm": 0.7697616947989054,
|
||
|
|
"learning_rate": 1.6355471975779255e-05,
|
||
|
|
"loss": 0.4592,
|
||
|
|
"mean_token_accuracy": 0.8422065414488316,
|
||
|
|
"num_tokens": 104326224.0,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.439117431640625,
|
||
|
|
"epoch": 0.9606299212598425,
|
||
|
|
"grad_norm": 0.7598725141371296,
|
||
|
|
"learning_rate": 1.6321864304663174e-05,
|
||
|
|
"loss": 0.4707,
|
||
|
|
"mean_token_accuracy": 0.8379590632393956,
|
||
|
|
"num_tokens": 104764846.0,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441436767578125,
|
||
|
|
"epoch": 0.9645669291338582,
|
||
|
|
"grad_norm": 0.7779917047277318,
|
||
|
|
"learning_rate": 1.6288137271121066e-05,
|
||
|
|
"loss": 0.4792,
|
||
|
|
"mean_token_accuracy": 0.8374456604942679,
|
||
|
|
"num_tokens": 105184708.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.439361572265625,
|
||
|
|
"epoch": 0.968503937007874,
|
||
|
|
"grad_norm": 0.8225151639761566,
|
||
|
|
"learning_rate": 1.6254291511949353e-05,
|
||
|
|
"loss": 0.472,
|
||
|
|
"mean_token_accuracy": 0.8368659280240536,
|
||
|
|
"num_tokens": 105616981.0,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.458038330078125,
|
||
|
|
"epoch": 0.9724409448818898,
|
||
|
|
"grad_norm": 0.8481725978257536,
|
||
|
|
"learning_rate": 1.62203276661861e-05,
|
||
|
|
"loss": 0.4583,
|
||
|
|
"mean_token_accuracy": 0.84222552459687,
|
||
|
|
"num_tokens": 106023465.0,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4459228515625,
|
||
|
|
"epoch": 0.9763779527559056,
|
||
|
|
"grad_norm": 0.8204906148542874,
|
||
|
|
"learning_rate": 1.618624637509895e-05,
|
||
|
|
"loss": 0.4653,
|
||
|
|
"mean_token_accuracy": 0.842322469688952,
|
||
|
|
"num_tokens": 106446966.0,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449951171875,
|
||
|
|
"epoch": 0.9803149606299213,
|
||
|
|
"grad_norm": 0.7515720467344851,
|
||
|
|
"learning_rate": 1.615204828217302e-05,
|
||
|
|
"loss": 0.4818,
|
||
|
|
"mean_token_accuracy": 0.8364261239767075,
|
||
|
|
"num_tokens": 106898608.0,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448944091796875,
|
||
|
|
"epoch": 0.984251968503937,
|
||
|
|
"grad_norm": 0.8467560345004221,
|
||
|
|
"learning_rate": 1.6117734033098744e-05,
|
||
|
|
"loss": 0.4605,
|
||
|
|
"mean_token_accuracy": 0.8424621764570475,
|
||
|
|
"num_tokens": 107332517.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.449981689453125,
|
||
|
|
"epoch": 0.9881889763779528,
|
||
|
|
"grad_norm": 0.7946487404168803,
|
||
|
|
"learning_rate": 1.60833042757597e-05,
|
||
|
|
"loss": 0.4609,
|
||
|
|
"mean_token_accuracy": 0.8438995983451605,
|
||
|
|
"num_tokens": 107756302.0,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453155517578125,
|
||
|
|
"epoch": 0.9921259842519685,
|
||
|
|
"grad_norm": 0.797337818033653,
|
||
|
|
"learning_rate": 1.604875966022035e-05,
|
||
|
|
"loss": 0.4702,
|
||
|
|
"mean_token_accuracy": 0.8400178952142596,
|
||
|
|
"num_tokens": 108189062.0,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.468475341796875,
|
||
|
|
"epoch": 0.9960629921259843,
|
||
|
|
"grad_norm": 0.8572234329705702,
|
||
|
|
"learning_rate": 1.6014100838713796e-05,
|
||
|
|
"loss": 0.4686,
|
||
|
|
"mean_token_accuracy": 0.8383585326373577,
|
||
|
|
"num_tokens": 108597149.0,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4630126953125,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 0.8143644791410944,
|
||
|
|
"learning_rate": 1.5979328465629435e-05,
|
||
|
|
"loss": 0.4634,
|
||
|
|
"mean_token_accuracy": 0.8411326240748167,
|
||
|
|
"num_tokens": 109014820.0,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.461944580078125,
|
||
|
|
"epoch": 1.0039370078740157,
|
||
|
|
"grad_norm": 0.7954464559368906,
|
||
|
|
"learning_rate": 1.5944443197500633e-05,
|
||
|
|
"loss": 0.4338,
|
||
|
|
"mean_token_accuracy": 0.8487551026046276,
|
||
|
|
"num_tokens": 109433802.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4547119140625,
|
||
|
|
"epoch": 1.0078740157480315,
|
||
|
|
"grad_norm": 0.8003325077804105,
|
||
|
|
"learning_rate": 1.59094456929923e-05,
|
||
|
|
"loss": 0.4411,
|
||
|
|
"mean_token_accuracy": 0.8471461059525609,
|
||
|
|
"num_tokens": 109861074.0,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.444732666015625,
|
||
|
|
"epoch": 1.0118110236220472,
|
||
|
|
"grad_norm": 0.8193521250103798,
|
||
|
|
"learning_rate": 1.5874336612888487e-05,
|
||
|
|
"loss": 0.4529,
|
||
|
|
"mean_token_accuracy": 0.8427032921463251,
|
||
|
|
"num_tokens": 110311771.0,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.445892333984375,
|
||
|
|
"epoch": 1.015748031496063,
|
||
|
|
"grad_norm": 0.8092675680016121,
|
||
|
|
"learning_rate": 1.5839116620079874e-05,
|
||
|
|
"loss": 0.4455,
|
||
|
|
"mean_token_accuracy": 0.8459412306547165,
|
||
|
|
"num_tokens": 110748553.0,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.453948974609375,
|
||
|
|
"epoch": 1.0196850393700787,
|
||
|
|
"grad_norm": 0.832494554347309,
|
||
|
|
"learning_rate": 1.580378637955128e-05,
|
||
|
|
"loss": 0.4401,
|
||
|
|
"mean_token_accuracy": 0.8475761925801635,
|
||
|
|
"num_tokens": 111156318.0,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.451507568359375,
|
||
|
|
"epoch": 1.0236220472440944,
|
||
|
|
"grad_norm": 0.773496689384206,
|
||
|
|
"learning_rate": 1.5768346558369105e-05,
|
||
|
|
"loss": 0.4306,
|
||
|
|
"mean_token_accuracy": 0.8483728105202317,
|
||
|
|
"num_tokens": 111582549.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44976806640625,
|
||
|
|
"epoch": 1.0275590551181102,
|
||
|
|
"grad_norm": 0.7754814952822502,
|
||
|
|
"learning_rate": 1.5732797825668714e-05,
|
||
|
|
"loss": 0.4365,
|
||
|
|
"mean_token_accuracy": 0.8502898076549172,
|
||
|
|
"num_tokens": 111998521.0,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448455810546875,
|
||
|
|
"epoch": 1.031496062992126,
|
||
|
|
"grad_norm": 0.8112260492156416,
|
||
|
|
"learning_rate": 1.5697140852641835e-05,
|
||
|
|
"loss": 0.4466,
|
||
|
|
"mean_token_accuracy": 0.8444716399535537,
|
||
|
|
"num_tokens": 112418889.0,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443206787109375,
|
||
|
|
"epoch": 1.0354330708661417,
|
||
|
|
"grad_norm": 0.792232440371198,
|
||
|
|
"learning_rate": 1.5661376312523854e-05,
|
||
|
|
"loss": 0.4454,
|
||
|
|
"mean_token_accuracy": 0.8456090791150928,
|
||
|
|
"num_tokens": 112847214.0,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44891357421875,
|
||
|
|
"epoch": 1.0393700787401574,
|
||
|
|
"grad_norm": 0.8601941273573265,
|
||
|
|
"learning_rate": 1.5625504880581136e-05,
|
||
|
|
"loss": 0.4437,
|
||
|
|
"mean_token_accuracy": 0.8455395260825753,
|
||
|
|
"num_tokens": 113289373.0,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44927978515625,
|
||
|
|
"epoch": 1.0433070866141732,
|
||
|
|
"grad_norm": 0.7714822661259353,
|
||
|
|
"learning_rate": 1.5589527234098247e-05,
|
||
|
|
"loss": 0.4456,
|
||
|
|
"mean_token_accuracy": 0.8444314748048782,
|
||
|
|
"num_tokens": 113723545.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442169189453125,
|
||
|
|
"epoch": 1.047244094488189,
|
||
|
|
"grad_norm": 0.8190193432749275,
|
||
|
|
"learning_rate": 1.5553444052365176e-05,
|
||
|
|
"loss": 0.4325,
|
||
|
|
"mean_token_accuracy": 0.8491929210722446,
|
||
|
|
"num_tokens": 114162192.0,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441619873046875,
|
||
|
|
"epoch": 1.0511811023622046,
|
||
|
|
"grad_norm": 0.7349029400939283,
|
||
|
|
"learning_rate": 1.5517256016664524e-05,
|
||
|
|
"loss": 0.4262,
|
||
|
|
"mean_token_accuracy": 0.8510750867426395,
|
||
|
|
"num_tokens": 114571282.0,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441680908203125,
|
||
|
|
"epoch": 1.0551181102362204,
|
||
|
|
"grad_norm": 0.7903067387402938,
|
||
|
|
"learning_rate": 1.5480963810258614e-05,
|
||
|
|
"loss": 0.4254,
|
||
|
|
"mean_token_accuracy": 0.8503110585734248,
|
||
|
|
"num_tokens": 115010760.0,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437774658203125,
|
||
|
|
"epoch": 1.0590551181102361,
|
||
|
|
"grad_norm": 0.8078264014108711,
|
||
|
|
"learning_rate": 1.5444568118376615e-05,
|
||
|
|
"loss": 0.4266,
|
||
|
|
"mean_token_accuracy": 0.8510611765086651,
|
||
|
|
"num_tokens": 115428842.0,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43804931640625,
|
||
|
|
"epoch": 1.0629921259842519,
|
||
|
|
"grad_norm": 0.7113878989173036,
|
||
|
|
"learning_rate": 1.5408069628201597e-05,
|
||
|
|
"loss": 0.4411,
|
||
|
|
"mean_token_accuracy": 0.8480326728895307,
|
||
|
|
"num_tokens": 115858145.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4405517578125,
|
||
|
|
"epoch": 1.0669291338582678,
|
||
|
|
"grad_norm": 0.7629159309151201,
|
||
|
|
"learning_rate": 1.5371469028857534e-05,
|
||
|
|
"loss": 0.44,
|
||
|
|
"mean_token_accuracy": 0.8475739294663072,
|
||
|
|
"num_tokens": 116286624.0,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43695068359375,
|
||
|
|
"epoch": 1.0708661417322836,
|
||
|
|
"grad_norm": 0.727617304934651,
|
||
|
|
"learning_rate": 1.533476701139633e-05,
|
||
|
|
"loss": 0.4364,
|
||
|
|
"mean_token_accuracy": 0.8470908785238862,
|
||
|
|
"num_tokens": 116713316.0,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427978515625,
|
||
|
|
"epoch": 1.0748031496062993,
|
||
|
|
"grad_norm": 0.6824270776024675,
|
||
|
|
"learning_rate": 1.5297964268784757e-05,
|
||
|
|
"loss": 0.445,
|
||
|
|
"mean_token_accuracy": 0.8450411949306726,
|
||
|
|
"num_tokens": 117167235.0,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432647705078125,
|
||
|
|
"epoch": 1.078740157480315,
|
||
|
|
"grad_norm": 0.7312979875445866,
|
||
|
|
"learning_rate": 1.5261061495891345e-05,
|
||
|
|
"loss": 0.4315,
|
||
|
|
"mean_token_accuracy": 0.8500176034867764,
|
||
|
|
"num_tokens": 117613634.0,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442596435546875,
|
||
|
|
"epoch": 1.0826771653543308,
|
||
|
|
"grad_norm": 0.8021478759232983,
|
||
|
|
"learning_rate": 1.5224059389473305e-05,
|
||
|
|
"loss": 0.4262,
|
||
|
|
"mean_token_accuracy": 0.8529012883082032,
|
||
|
|
"num_tokens": 118016077.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430023193359375,
|
||
|
|
"epoch": 1.0866141732283465,
|
||
|
|
"grad_norm": 0.7358571235840625,
|
||
|
|
"learning_rate": 1.5186958648163344e-05,
|
||
|
|
"loss": 0.4444,
|
||
|
|
"mean_token_accuracy": 0.8468069788068533,
|
||
|
|
"num_tokens": 118481042.0,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4302978515625,
|
||
|
|
"epoch": 1.0905511811023623,
|
||
|
|
"grad_norm": 0.7025837618217786,
|
||
|
|
"learning_rate": 1.514975997245649e-05,
|
||
|
|
"loss": 0.4256,
|
||
|
|
"mean_token_accuracy": 0.8535407232120633,
|
||
|
|
"num_tokens": 118916540.0,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427886962890625,
|
||
|
|
"epoch": 1.094488188976378,
|
||
|
|
"grad_norm": 0.7256948688401913,
|
||
|
|
"learning_rate": 1.5112464064696857e-05,
|
||
|
|
"loss": 0.4287,
|
||
|
|
"mean_token_accuracy": 0.8516247281804681,
|
||
|
|
"num_tokens": 119355413.0,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.428436279296875,
|
||
|
|
"epoch": 1.0984251968503937,
|
||
|
|
"grad_norm": 0.7780711790160693,
|
||
|
|
"learning_rate": 1.5075071629064381e-05,
|
||
|
|
"loss": 0.4472,
|
||
|
|
"mean_token_accuracy": 0.8435391476377845,
|
||
|
|
"num_tokens": 119789004.0,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432952880859375,
|
||
|
|
"epoch": 1.1023622047244095,
|
||
|
|
"grad_norm": 0.7299240812653669,
|
||
|
|
"learning_rate": 1.5037583371561538e-05,
|
||
|
|
"loss": 0.4361,
|
||
|
|
"mean_token_accuracy": 0.8491124296560884,
|
||
|
|
"num_tokens": 120207170.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43359375,
|
||
|
|
"epoch": 1.1062992125984252,
|
||
|
|
"grad_norm": 0.7409579348895512,
|
||
|
|
"learning_rate": 1.5000000000000002e-05,
|
||
|
|
"loss": 0.4435,
|
||
|
|
"mean_token_accuracy": 0.8463100017979741,
|
||
|
|
"num_tokens": 120639252.0,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4307861328125,
|
||
|
|
"epoch": 1.110236220472441,
|
||
|
|
"grad_norm": 0.7628848757746238,
|
||
|
|
"learning_rate": 1.4962322223987284e-05,
|
||
|
|
"loss": 0.4293,
|
||
|
|
"mean_token_accuracy": 0.8522774102166295,
|
||
|
|
"num_tokens": 121066062.0,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.433319091796875,
|
||
|
|
"epoch": 1.1141732283464567,
|
||
|
|
"grad_norm": 0.819524267296068,
|
||
|
|
"learning_rate": 1.4924550754913341e-05,
|
||
|
|
"loss": 0.4334,
|
||
|
|
"mean_token_accuracy": 0.8487180238589644,
|
||
|
|
"num_tokens": 121494245.0,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424285888671875,
|
||
|
|
"epoch": 1.1181102362204725,
|
||
|
|
"grad_norm": 0.7201240645893007,
|
||
|
|
"learning_rate": 1.4886686305937133e-05,
|
||
|
|
"loss": 0.4231,
|
||
|
|
"mean_token_accuracy": 0.8517216173931956,
|
||
|
|
"num_tokens": 121939800.0,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424102783203125,
|
||
|
|
"epoch": 1.1220472440944882,
|
||
|
|
"grad_norm": 0.7187534396317979,
|
||
|
|
"learning_rate": 1.4848729591973165e-05,
|
||
|
|
"loss": 0.4276,
|
||
|
|
"mean_token_accuracy": 0.8503343118354678,
|
||
|
|
"num_tokens": 122376313.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4246826171875,
|
||
|
|
"epoch": 1.125984251968504,
|
||
|
|
"grad_norm": 0.7377358272265221,
|
||
|
|
"learning_rate": 1.4810681329677988e-05,
|
||
|
|
"loss": 0.4319,
|
||
|
|
"mean_token_accuracy": 0.8497972404584289,
|
||
|
|
"num_tokens": 122805350.0,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42626953125,
|
||
|
|
"epoch": 1.1299212598425197,
|
||
|
|
"grad_norm": 1.0338835019493653,
|
||
|
|
"learning_rate": 1.477254223743666e-05,
|
||
|
|
"loss": 0.4286,
|
||
|
|
"mean_token_accuracy": 0.8504892103374004,
|
||
|
|
"num_tokens": 123229585.0,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430145263671875,
|
||
|
|
"epoch": 1.1338582677165354,
|
||
|
|
"grad_norm": 0.7549593653526081,
|
||
|
|
"learning_rate": 1.4734313035349205e-05,
|
||
|
|
"loss": 0.4157,
|
||
|
|
"mean_token_accuracy": 0.8539745900779963,
|
||
|
|
"num_tokens": 123667139.0,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42559814453125,
|
||
|
|
"epoch": 1.1377952755905512,
|
||
|
|
"grad_norm": 0.7134584103671152,
|
||
|
|
"learning_rate": 1.4695994445216985e-05,
|
||
|
|
"loss": 0.4429,
|
||
|
|
"mean_token_accuracy": 0.8458532355725765,
|
||
|
|
"num_tokens": 124098824.0,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.433685302734375,
|
||
|
|
"epoch": 1.141732283464567,
|
||
|
|
"grad_norm": 0.7330700557433506,
|
||
|
|
"learning_rate": 1.4657587190529099e-05,
|
||
|
|
"loss": 0.4262,
|
||
|
|
"mean_token_accuracy": 0.8504032399505377,
|
||
|
|
"num_tokens": 124515502.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429229736328125,
|
||
|
|
"epoch": 1.1456692913385826,
|
||
|
|
"grad_norm": 0.7185743079970203,
|
||
|
|
"learning_rate": 1.4619091996448703e-05,
|
||
|
|
"loss": 0.4283,
|
||
|
|
"mean_token_accuracy": 0.8502658074721694,
|
||
|
|
"num_tokens": 124940837.0,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442840576171875,
|
||
|
|
"epoch": 1.1496062992125984,
|
||
|
|
"grad_norm": 0.7104408412257786,
|
||
|
|
"learning_rate": 1.458050958979933e-05,
|
||
|
|
"loss": 0.4236,
|
||
|
|
"mean_token_accuracy": 0.8509505931288004,
|
||
|
|
"num_tokens": 125365604.0,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4324951171875,
|
||
|
|
"epoch": 1.1535433070866141,
|
||
|
|
"grad_norm": 0.7048242096715929,
|
||
|
|
"learning_rate": 1.4541840699051168e-05,
|
||
|
|
"loss": 0.4348,
|
||
|
|
"mean_token_accuracy": 0.8513169949874282,
|
||
|
|
"num_tokens": 125806425.0,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.439178466796875,
|
||
|
|
"epoch": 1.1574803149606299,
|
||
|
|
"grad_norm": 0.7362993621651959,
|
||
|
|
"learning_rate": 1.4503086054307299e-05,
|
||
|
|
"loss": 0.4319,
|
||
|
|
"mean_token_accuracy": 0.8472118573263288,
|
||
|
|
"num_tokens": 126232534.0,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437774658203125,
|
||
|
|
"epoch": 1.1614173228346456,
|
||
|
|
"grad_norm": 0.7408531510746268,
|
||
|
|
"learning_rate": 1.4464246387289913e-05,
|
||
|
|
"loss": 0.4302,
|
||
|
|
"mean_token_accuracy": 0.8503606310114264,
|
||
|
|
"num_tokens": 126682871.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44171142578125,
|
||
|
|
"epoch": 1.1653543307086613,
|
||
|
|
"grad_norm": 0.6950128099475602,
|
||
|
|
"learning_rate": 1.4425322431326504e-05,
|
||
|
|
"loss": 0.4388,
|
||
|
|
"mean_token_accuracy": 0.8464201996102929,
|
||
|
|
"num_tokens": 127113863.0,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44354248046875,
|
||
|
|
"epoch": 1.169291338582677,
|
||
|
|
"grad_norm": 0.7118074965909784,
|
||
|
|
"learning_rate": 1.438631492133601e-05,
|
||
|
|
"loss": 0.4127,
|
||
|
|
"mean_token_accuracy": 0.8560283463448286,
|
||
|
|
"num_tokens": 127533345.0,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44549560546875,
|
||
|
|
"epoch": 1.1732283464566928,
|
||
|
|
"grad_norm": 0.7536132862966228,
|
||
|
|
"learning_rate": 1.4347224593814946e-05,
|
||
|
|
"loss": 0.4319,
|
||
|
|
"mean_token_accuracy": 0.8498925063759089,
|
||
|
|
"num_tokens": 127948674.0,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.448089599609375,
|
||
|
|
"epoch": 1.1771653543307086,
|
||
|
|
"grad_norm": 0.7828383574148693,
|
||
|
|
"learning_rate": 1.4308052186823494e-05,
|
||
|
|
"loss": 0.4221,
|
||
|
|
"mean_token_accuracy": 0.8534400537610054,
|
||
|
|
"num_tokens": 128362416.0,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43896484375,
|
||
|
|
"epoch": 1.1811023622047245,
|
||
|
|
"grad_norm": 0.7094320191090657,
|
||
|
|
"learning_rate": 1.4268798439971572e-05,
|
||
|
|
"loss": 0.4291,
|
||
|
|
"mean_token_accuracy": 0.8505124570801854,
|
||
|
|
"num_tokens": 128812262.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.431182861328125,
|
||
|
|
"epoch": 1.1850393700787403,
|
||
|
|
"grad_norm": 0.7150260895657441,
|
||
|
|
"learning_rate": 1.4229464094404866e-05,
|
||
|
|
"loss": 0.4327,
|
||
|
|
"mean_token_accuracy": 0.850852720439434,
|
||
|
|
"num_tokens": 129267041.0,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437042236328125,
|
||
|
|
"epoch": 1.188976377952756,
|
||
|
|
"grad_norm": 0.7384859056056637,
|
||
|
|
"learning_rate": 1.4190049892790838e-05,
|
||
|
|
"loss": 0.4278,
|
||
|
|
"mean_token_accuracy": 0.8514499422162771,
|
||
|
|
"num_tokens": 129701771.0,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.442169189453125,
|
||
|
|
"epoch": 1.1929133858267718,
|
||
|
|
"grad_norm": 0.7886399080694347,
|
||
|
|
"learning_rate": 1.4150556579304699e-05,
|
||
|
|
"loss": 0.442,
|
||
|
|
"mean_token_accuracy": 0.8480188464745879,
|
||
|
|
"num_tokens": 130124702.0,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441558837890625,
|
||
|
|
"epoch": 1.1968503937007875,
|
||
|
|
"grad_norm": 0.7466370557730762,
|
||
|
|
"learning_rate": 1.4110984899615367e-05,
|
||
|
|
"loss": 0.4191,
|
||
|
|
"mean_token_accuracy": 0.8521094862371683,
|
||
|
|
"num_tokens": 130538776.0,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.436798095703125,
|
||
|
|
"epoch": 1.2007874015748032,
|
||
|
|
"grad_norm": 0.8104033828186294,
|
||
|
|
"learning_rate": 1.4071335600871388e-05,
|
||
|
|
"loss": 0.4228,
|
||
|
|
"mean_token_accuracy": 0.8536786120384932,
|
||
|
|
"num_tokens": 130979212.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.443572998046875,
|
||
|
|
"epoch": 1.204724409448819,
|
||
|
|
"grad_norm": 0.7850350920921043,
|
||
|
|
"learning_rate": 1.4031609431686809e-05,
|
||
|
|
"loss": 0.4163,
|
||
|
|
"mean_token_accuracy": 0.8537906985729933,
|
||
|
|
"num_tokens": 131405428.0,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.441162109375,
|
||
|
|
"epoch": 1.2086614173228347,
|
||
|
|
"grad_norm": 0.7762261376252096,
|
||
|
|
"learning_rate": 1.3991807142127082e-05,
|
||
|
|
"loss": 0.4339,
|
||
|
|
"mean_token_accuracy": 0.8480509323999286,
|
||
|
|
"num_tokens": 131837961.0,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4342041015625,
|
||
|
|
"epoch": 1.2125984251968505,
|
||
|
|
"grad_norm": 0.6805963341437536,
|
||
|
|
"learning_rate": 1.3951929483694855e-05,
|
||
|
|
"loss": 0.4219,
|
||
|
|
"mean_token_accuracy": 0.8525160830467939,
|
||
|
|
"num_tokens": 132267303.0,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.431793212890625,
|
||
|
|
"epoch": 1.2165354330708662,
|
||
|
|
"grad_norm": 0.7879655600731412,
|
||
|
|
"learning_rate": 1.3911977209315828e-05,
|
||
|
|
"loss": 0.4412,
|
||
|
|
"mean_token_accuracy": 0.8463876061141491,
|
||
|
|
"num_tokens": 132723994.0,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43023681640625,
|
||
|
|
"epoch": 1.220472440944882,
|
||
|
|
"grad_norm": 0.7753424379604112,
|
||
|
|
"learning_rate": 1.3871951073324508e-05,
|
||
|
|
"loss": 0.4229,
|
||
|
|
"mean_token_accuracy": 0.8535848595201969,
|
||
|
|
"num_tokens": 133172478.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43377685546875,
|
||
|
|
"epoch": 1.2244094488188977,
|
||
|
|
"grad_norm": 0.7633773162167979,
|
||
|
|
"learning_rate": 1.3831851831449973e-05,
|
||
|
|
"loss": 0.4372,
|
||
|
|
"mean_token_accuracy": 0.8473470462486148,
|
||
|
|
"num_tokens": 133631582.0,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437347412109375,
|
||
|
|
"epoch": 1.2283464566929134,
|
||
|
|
"grad_norm": 0.7490203495280126,
|
||
|
|
"learning_rate": 1.3791680240801608e-05,
|
||
|
|
"loss": 0.4253,
|
||
|
|
"mean_token_accuracy": 0.8518748395144939,
|
||
|
|
"num_tokens": 134067377.0,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437225341796875,
|
||
|
|
"epoch": 1.2322834645669292,
|
||
|
|
"grad_norm": 0.7821105835443857,
|
||
|
|
"learning_rate": 1.3751437059854809e-05,
|
||
|
|
"loss": 0.43,
|
||
|
|
"mean_token_accuracy": 0.8515894012525678,
|
||
|
|
"num_tokens": 134491172.0,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.440277099609375,
|
||
|
|
"epoch": 1.236220472440945,
|
||
|
|
"grad_norm": 0.7967419864814663,
|
||
|
|
"learning_rate": 1.3711123048436652e-05,
|
||
|
|
"loss": 0.4194,
|
||
|
|
"mean_token_accuracy": 0.8529211021959782,
|
||
|
|
"num_tokens": 134898327.0,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432159423828125,
|
||
|
|
"epoch": 1.2401574803149606,
|
||
|
|
"grad_norm": 0.7914552161401749,
|
||
|
|
"learning_rate": 1.3670738967711566e-05,
|
||
|
|
"loss": 0.421,
|
||
|
|
"mean_token_accuracy": 0.8497829381376505,
|
||
|
|
"num_tokens": 135323334.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43121337890625,
|
||
|
|
"epoch": 1.2440944881889764,
|
||
|
|
"grad_norm": 0.7866132156657194,
|
||
|
|
"learning_rate": 1.3630285580166946e-05,
|
||
|
|
"loss": 0.4255,
|
||
|
|
"mean_token_accuracy": 0.8531960425898433,
|
||
|
|
"num_tokens": 135755228.0,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426910400390625,
|
||
|
|
"epoch": 1.2480314960629921,
|
||
|
|
"grad_norm": 0.770938614701006,
|
||
|
|
"learning_rate": 1.358976364959876e-05,
|
||
|
|
"loss": 0.4332,
|
||
|
|
"mean_token_accuracy": 0.8500475706532598,
|
||
|
|
"num_tokens": 136203741.0,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41986083984375,
|
||
|
|
"epoch": 1.2519685039370079,
|
||
|
|
"grad_norm": 0.6865132743346913,
|
||
|
|
"learning_rate": 1.3549173941097134e-05,
|
||
|
|
"loss": 0.4131,
|
||
|
|
"mean_token_accuracy": 0.8563643284142017,
|
||
|
|
"num_tokens": 136627448.0,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42071533203125,
|
||
|
|
"epoch": 1.2559055118110236,
|
||
|
|
"grad_norm": 0.76446995637985,
|
||
|
|
"learning_rate": 1.3508517221031898e-05,
|
||
|
|
"loss": 0.4306,
|
||
|
|
"mean_token_accuracy": 0.8519408302381635,
|
||
|
|
"num_tokens": 137055328.0,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418701171875,
|
||
|
|
"epoch": 1.2598425196850394,
|
||
|
|
"grad_norm": 0.7091617898089253,
|
||
|
|
"learning_rate": 1.346779425703812e-05,
|
||
|
|
"loss": 0.4129,
|
||
|
|
"mean_token_accuracy": 0.8539134385064244,
|
||
|
|
"num_tokens": 137484547.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423858642578125,
|
||
|
|
"epoch": 1.263779527559055,
|
||
|
|
"grad_norm": 0.7176796997298532,
|
||
|
|
"learning_rate": 1.3427005818001615e-05,
|
||
|
|
"loss": 0.4299,
|
||
|
|
"mean_token_accuracy": 0.8518371032550931,
|
||
|
|
"num_tokens": 137902620.0,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423004150390625,
|
||
|
|
"epoch": 1.2677165354330708,
|
||
|
|
"grad_norm": 0.7495725876408128,
|
||
|
|
"learning_rate": 1.3386152674044421e-05,
|
||
|
|
"loss": 0.4316,
|
||
|
|
"mean_token_accuracy": 0.8498124582692981,
|
||
|
|
"num_tokens": 138337754.0,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41943359375,
|
||
|
|
"epoch": 1.2716535433070866,
|
||
|
|
"grad_norm": 0.7128450005810961,
|
||
|
|
"learning_rate": 1.334523559651027e-05,
|
||
|
|
"loss": 0.4182,
|
||
|
|
"mean_token_accuracy": 0.8539979690685868,
|
||
|
|
"num_tokens": 138765177.0,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429351806640625,
|
||
|
|
"epoch": 1.2755905511811023,
|
||
|
|
"grad_norm": 0.7646983127030109,
|
||
|
|
"learning_rate": 1.3304255357950004e-05,
|
||
|
|
"loss": 0.4144,
|
||
|
|
"mean_token_accuracy": 0.8527556182816625,
|
||
|
|
"num_tokens": 139191798.0,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.419830322265625,
|
||
|
|
"epoch": 1.279527559055118,
|
||
|
|
"grad_norm": 0.720920386611709,
|
||
|
|
"learning_rate": 1.3263212732107014e-05,
|
||
|
|
"loss": 0.4307,
|
||
|
|
"mean_token_accuracy": 0.8504047309979796,
|
||
|
|
"num_tokens": 139628968.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42205810546875,
|
||
|
|
"epoch": 1.2834645669291338,
|
||
|
|
"grad_norm": 0.7638329733373392,
|
||
|
|
"learning_rate": 1.3222108493902613e-05,
|
||
|
|
"loss": 0.4227,
|
||
|
|
"mean_token_accuracy": 0.8527109837159514,
|
||
|
|
"num_tokens": 140048601.0,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422271728515625,
|
||
|
|
"epoch": 1.2874015748031495,
|
||
|
|
"grad_norm": 0.7211271314435947,
|
||
|
|
"learning_rate": 1.3180943419421409e-05,
|
||
|
|
"loss": 0.4166,
|
||
|
|
"mean_token_accuracy": 0.8536871457472444,
|
||
|
|
"num_tokens": 140465892.0,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415863037109375,
|
||
|
|
"epoch": 1.2913385826771653,
|
||
|
|
"grad_norm": 0.6870608297675647,
|
||
|
|
"learning_rate": 1.3139718285896657e-05,
|
||
|
|
"loss": 0.4196,
|
||
|
|
"mean_token_accuracy": 0.854581861756742,
|
||
|
|
"num_tokens": 140899011.0,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4163818359375,
|
||
|
|
"epoch": 1.295275590551181,
|
||
|
|
"grad_norm": 0.7349072604607171,
|
||
|
|
"learning_rate": 1.3098433871695572e-05,
|
||
|
|
"loss": 0.4247,
|
||
|
|
"mean_token_accuracy": 0.8518269741907716,
|
||
|
|
"num_tokens": 141327992.0,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.413848876953125,
|
||
|
|
"epoch": 1.2992125984251968,
|
||
|
|
"grad_norm": 0.7355537771461196,
|
||
|
|
"learning_rate": 1.305709095630466e-05,
|
||
|
|
"loss": 0.4277,
|
||
|
|
"mean_token_accuracy": 0.850621142424643,
|
||
|
|
"num_tokens": 141766293.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420196533203125,
|
||
|
|
"epoch": 1.3031496062992125,
|
||
|
|
"grad_norm": 0.676410365989162,
|
||
|
|
"learning_rate": 1.3015690320314952e-05,
|
||
|
|
"loss": 0.4117,
|
||
|
|
"mean_token_accuracy": 0.8557650512084365,
|
||
|
|
"num_tokens": 142186747.0,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418426513671875,
|
||
|
|
"epoch": 1.3070866141732282,
|
||
|
|
"grad_norm": 0.7156171897978183,
|
||
|
|
"learning_rate": 1.2974232745407326e-05,
|
||
|
|
"loss": 0.4005,
|
||
|
|
"mean_token_accuracy": 0.8567338529974222,
|
||
|
|
"num_tokens": 142604960.0,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426544189453125,
|
||
|
|
"epoch": 1.311023622047244,
|
||
|
|
"grad_norm": 0.6649803220610805,
|
||
|
|
"learning_rate": 1.2932719014337697e-05,
|
||
|
|
"loss": 0.4207,
|
||
|
|
"mean_token_accuracy": 0.8537461366504431,
|
||
|
|
"num_tokens": 143019812.0,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4180908203125,
|
||
|
|
"epoch": 1.3149606299212597,
|
||
|
|
"grad_norm": 0.6991751691443175,
|
||
|
|
"learning_rate": 1.2891149910922267e-05,
|
||
|
|
"loss": 0.4184,
|
||
|
|
"mean_token_accuracy": 0.8524497682228684,
|
||
|
|
"num_tokens": 143476442.0,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.417327880859375,
|
||
|
|
"epoch": 1.3188976377952755,
|
||
|
|
"grad_norm": 0.7211632281848889,
|
||
|
|
"learning_rate": 1.2849526220022713e-05,
|
||
|
|
"loss": 0.4192,
|
||
|
|
"mean_token_accuracy": 0.8534535896033049,
|
||
|
|
"num_tokens": 143905945.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42132568359375,
|
||
|
|
"epoch": 1.3228346456692912,
|
||
|
|
"grad_norm": 0.7616936561270807,
|
||
|
|
"learning_rate": 1.2807848727531372e-05,
|
||
|
|
"loss": 0.4269,
|
||
|
|
"mean_token_accuracy": 0.8515564789995551,
|
||
|
|
"num_tokens": 144328854.0,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4254150390625,
|
||
|
|
"epoch": 1.326771653543307,
|
||
|
|
"grad_norm": 0.6751865858547865,
|
||
|
|
"learning_rate": 1.276611822035641e-05,
|
||
|
|
"loss": 0.4183,
|
||
|
|
"mean_token_accuracy": 0.8517815675586462,
|
||
|
|
"num_tokens": 144767518.0,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42669677734375,
|
||
|
|
"epoch": 1.330708661417323,
|
||
|
|
"grad_norm": 0.7290990437060788,
|
||
|
|
"learning_rate": 1.2724335486406947e-05,
|
||
|
|
"loss": 0.4058,
|
||
|
|
"mean_token_accuracy": 0.8585278857499361,
|
||
|
|
"num_tokens": 145204450.0,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421051025390625,
|
||
|
|
"epoch": 1.3346456692913387,
|
||
|
|
"grad_norm": 0.687664495765818,
|
||
|
|
"learning_rate": 1.26825013145782e-05,
|
||
|
|
"loss": 0.4086,
|
||
|
|
"mean_token_accuracy": 0.8543958617374301,
|
||
|
|
"num_tokens": 145650203.0,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.417572021484375,
|
||
|
|
"epoch": 1.3385826771653544,
|
||
|
|
"grad_norm": 0.746553500387901,
|
||
|
|
"learning_rate": 1.264061649473657e-05,
|
||
|
|
"loss": 0.4309,
|
||
|
|
"mean_token_accuracy": 0.8497815914452076,
|
||
|
|
"num_tokens": 146086535.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420928955078125,
|
||
|
|
"epoch": 1.3425196850393701,
|
||
|
|
"grad_norm": 0.642370025581711,
|
||
|
|
"learning_rate": 1.2598681817704755e-05,
|
||
|
|
"loss": 0.4232,
|
||
|
|
"mean_token_accuracy": 0.8529811156913638,
|
||
|
|
"num_tokens": 146525758.0,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43072509765625,
|
||
|
|
"epoch": 1.3464566929133859,
|
||
|
|
"grad_norm": 0.6619628860223508,
|
||
|
|
"learning_rate": 1.2556698075246776e-05,
|
||
|
|
"loss": 0.4163,
|
||
|
|
"mean_token_accuracy": 0.853263552300632,
|
||
|
|
"num_tokens": 146949125.0,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420806884765625,
|
||
|
|
"epoch": 1.3503937007874016,
|
||
|
|
"grad_norm": 0.7236645387749685,
|
||
|
|
"learning_rate": 1.2514666060053075e-05,
|
||
|
|
"loss": 0.426,
|
||
|
|
"mean_token_accuracy": 0.8519914764910936,
|
||
|
|
"num_tokens": 147387665.0,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42279052734375,
|
||
|
|
"epoch": 1.3543307086614174,
|
||
|
|
"grad_norm": 0.7020684815127656,
|
||
|
|
"learning_rate": 1.2472586565725513e-05,
|
||
|
|
"loss": 0.4075,
|
||
|
|
"mean_token_accuracy": 0.8560398099943995,
|
||
|
|
"num_tokens": 147814355.0,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41729736328125,
|
||
|
|
"epoch": 1.358267716535433,
|
||
|
|
"grad_norm": 0.7351973042533226,
|
||
|
|
"learning_rate": 1.2430460386762406e-05,
|
||
|
|
"loss": 0.4176,
|
||
|
|
"mean_token_accuracy": 0.8536938540637493,
|
||
|
|
"num_tokens": 148258418.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4271240234375,
|
||
|
|
"epoch": 1.3622047244094488,
|
||
|
|
"grad_norm": 0.7201567319054922,
|
||
|
|
"learning_rate": 1.2388288318543513e-05,
|
||
|
|
"loss": 0.4225,
|
||
|
|
"mean_token_accuracy": 0.8535017529502511,
|
||
|
|
"num_tokens": 148709126.0,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430023193359375,
|
||
|
|
"epoch": 1.3661417322834646,
|
||
|
|
"grad_norm": 0.7437528228507801,
|
||
|
|
"learning_rate": 1.2346071157315026e-05,
|
||
|
|
"loss": 0.4164,
|
||
|
|
"mean_token_accuracy": 0.8525076750665903,
|
||
|
|
"num_tokens": 149129878.0,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43548583984375,
|
||
|
|
"epoch": 1.3700787401574803,
|
||
|
|
"grad_norm": 0.7497143090625151,
|
||
|
|
"learning_rate": 1.230380970017453e-05,
|
||
|
|
"loss": 0.4273,
|
||
|
|
"mean_token_accuracy": 0.85127994697541,
|
||
|
|
"num_tokens": 149546730.0,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42578125,
|
||
|
|
"epoch": 1.374015748031496,
|
||
|
|
"grad_norm": 0.741499648414104,
|
||
|
|
"learning_rate": 1.2261504745055963e-05,
|
||
|
|
"loss": 0.4188,
|
||
|
|
"mean_token_accuracy": 0.8544770767912269,
|
||
|
|
"num_tokens": 149964392.0,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426513671875,
|
||
|
|
"epoch": 1.3779527559055118,
|
||
|
|
"grad_norm": 0.699150853468733,
|
||
|
|
"learning_rate": 1.2219157090714536e-05,
|
||
|
|
"loss": 0.4203,
|
||
|
|
"mean_token_accuracy": 0.8539074826985598,
|
||
|
|
"num_tokens": 150387746.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43170166015625,
|
||
|
|
"epoch": 1.3818897637795275,
|
||
|
|
"grad_norm": 0.7799990935872828,
|
||
|
|
"learning_rate": 1.2176767536711658e-05,
|
||
|
|
"loss": 0.4148,
|
||
|
|
"mean_token_accuracy": 0.8564818482846022,
|
||
|
|
"num_tokens": 150818559.0,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42596435546875,
|
||
|
|
"epoch": 1.3858267716535433,
|
||
|
|
"grad_norm": 0.680008208583702,
|
||
|
|
"learning_rate": 1.2134336883399855e-05,
|
||
|
|
"loss": 0.4068,
|
||
|
|
"mean_token_accuracy": 0.8564103506505489,
|
||
|
|
"num_tokens": 151239247.0,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422210693359375,
|
||
|
|
"epoch": 1.389763779527559,
|
||
|
|
"grad_norm": 0.7126291861119709,
|
||
|
|
"learning_rate": 1.2091865931907627e-05,
|
||
|
|
"loss": 0.4151,
|
||
|
|
"mean_token_accuracy": 0.8551490902900696,
|
||
|
|
"num_tokens": 151671201.0,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429656982421875,
|
||
|
|
"epoch": 1.3937007874015748,
|
||
|
|
"grad_norm": 0.7386831790798457,
|
||
|
|
"learning_rate": 1.2049355484124351e-05,
|
||
|
|
"loss": 0.4214,
|
||
|
|
"mean_token_accuracy": 0.8525898391380906,
|
||
|
|
"num_tokens": 152092308.0,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42755126953125,
|
||
|
|
"epoch": 1.3976377952755905,
|
||
|
|
"grad_norm": 0.6966152016027973,
|
||
|
|
"learning_rate": 1.2006806342685127e-05,
|
||
|
|
"loss": 0.4244,
|
||
|
|
"mean_token_accuracy": 0.8513237368315458,
|
||
|
|
"num_tokens": 152521483.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42974853515625,
|
||
|
|
"epoch": 1.4015748031496063,
|
||
|
|
"grad_norm": 0.6830368381973837,
|
||
|
|
"learning_rate": 1.196421931095562e-05,
|
||
|
|
"loss": 0.4075,
|
||
|
|
"mean_token_accuracy": 0.8552977237850428,
|
||
|
|
"num_tokens": 152951062.0,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4261474609375,
|
||
|
|
"epoch": 1.405511811023622,
|
||
|
|
"grad_norm": 0.7312814209734928,
|
||
|
|
"learning_rate": 1.1921595193016905e-05,
|
||
|
|
"loss": 0.4078,
|
||
|
|
"mean_token_accuracy": 0.8583087539300323,
|
||
|
|
"num_tokens": 153368483.0,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4244384765625,
|
||
|
|
"epoch": 1.4094488188976377,
|
||
|
|
"grad_norm": 0.6835796438866236,
|
||
|
|
"learning_rate": 1.1878934793650273e-05,
|
||
|
|
"loss": 0.4146,
|
||
|
|
"mean_token_accuracy": 0.8542582355439663,
|
||
|
|
"num_tokens": 153791350.0,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41558837890625,
|
||
|
|
"epoch": 1.4133858267716535,
|
||
|
|
"grad_norm": 0.6586015598075508,
|
||
|
|
"learning_rate": 1.1836238918322041e-05,
|
||
|
|
"loss": 0.4094,
|
||
|
|
"mean_token_accuracy": 0.855911853723228,
|
||
|
|
"num_tokens": 154243369.0,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425201416015625,
|
||
|
|
"epoch": 1.4173228346456692,
|
||
|
|
"grad_norm": 0.6850251308250546,
|
||
|
|
"learning_rate": 1.1793508373168346e-05,
|
||
|
|
"loss": 0.4108,
|
||
|
|
"mean_token_accuracy": 0.8540153652429581,
|
||
|
|
"num_tokens": 154653609.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427581787109375,
|
||
|
|
"epoch": 1.421259842519685,
|
||
|
|
"grad_norm": 0.6685207920133769,
|
||
|
|
"learning_rate": 1.1750743964979919e-05,
|
||
|
|
"loss": 0.4191,
|
||
|
|
"mean_token_accuracy": 0.8522771028801799,
|
||
|
|
"num_tokens": 155070913.0,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.416900634765625,
|
||
|
|
"epoch": 1.425196850393701,
|
||
|
|
"grad_norm": 0.7561179827701838,
|
||
|
|
"learning_rate": 1.1707946501186853e-05,
|
||
|
|
"loss": 0.4167,
|
||
|
|
"mean_token_accuracy": 0.8548763170838356,
|
||
|
|
"num_tokens": 155529833.0,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422332763671875,
|
||
|
|
"epoch": 1.4291338582677167,
|
||
|
|
"grad_norm": 0.6786764165360469,
|
||
|
|
"learning_rate": 1.1665116789843376e-05,
|
||
|
|
"loss": 0.412,
|
||
|
|
"mean_token_accuracy": 0.8565678047016263,
|
||
|
|
"num_tokens": 155974428.0,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4224853515625,
|
||
|
|
"epoch": 1.4330708661417324,
|
||
|
|
"grad_norm": 0.6451845587584786,
|
||
|
|
"learning_rate": 1.1622255639612553e-05,
|
||
|
|
"loss": 0.4125,
|
||
|
|
"mean_token_accuracy": 0.8562461519613862,
|
||
|
|
"num_tokens": 156403714.0,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418853759765625,
|
||
|
|
"epoch": 1.4370078740157481,
|
||
|
|
"grad_norm": 0.6849351310551496,
|
||
|
|
"learning_rate": 1.1579363859751069e-05,
|
||
|
|
"loss": 0.4234,
|
||
|
|
"mean_token_accuracy": 0.8534889034926891,
|
||
|
|
"num_tokens": 156832734.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41510009765625,
|
||
|
|
"epoch": 1.4409448818897639,
|
||
|
|
"grad_norm": 0.7306208881773922,
|
||
|
|
"learning_rate": 1.1536442260093908e-05,
|
||
|
|
"loss": 0.4125,
|
||
|
|
"mean_token_accuracy": 0.8564850222319365,
|
||
|
|
"num_tokens": 157293224.0,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42584228515625,
|
||
|
|
"epoch": 1.4448818897637796,
|
||
|
|
"grad_norm": 0.6623778060860301,
|
||
|
|
"learning_rate": 1.1493491651039077e-05,
|
||
|
|
"loss": 0.4085,
|
||
|
|
"mean_token_accuracy": 0.8566481098532677,
|
||
|
|
"num_tokens": 157714322.0,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42413330078125,
|
||
|
|
"epoch": 1.4488188976377954,
|
||
|
|
"grad_norm": 0.6686515862412451,
|
||
|
|
"learning_rate": 1.1450512843532315e-05,
|
||
|
|
"loss": 0.4232,
|
||
|
|
"mean_token_accuracy": 0.8522218987345695,
|
||
|
|
"num_tokens": 158153426.0,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.431060791015625,
|
||
|
|
"epoch": 1.452755905511811,
|
||
|
|
"grad_norm": 0.6883001908644174,
|
||
|
|
"learning_rate": 1.140750664905177e-05,
|
||
|
|
"loss": 0.4226,
|
||
|
|
"mean_token_accuracy": 0.8529786402359605,
|
||
|
|
"num_tokens": 158578195.0,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423797607421875,
|
||
|
|
"epoch": 1.4566929133858268,
|
||
|
|
"grad_norm": 0.6606135266079957,
|
||
|
|
"learning_rate": 1.1364473879592674e-05,
|
||
|
|
"loss": 0.413,
|
||
|
|
"mean_token_accuracy": 0.8547450276091695,
|
||
|
|
"num_tokens": 159000177.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4185791015625,
|
||
|
|
"epoch": 1.4606299212598426,
|
||
|
|
"grad_norm": 0.7149594414525444,
|
||
|
|
"learning_rate": 1.1321415347652031e-05,
|
||
|
|
"loss": 0.3968,
|
||
|
|
"mean_token_accuracy": 0.8579149143770337,
|
||
|
|
"num_tokens": 159434631.0,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421356201171875,
|
||
|
|
"epoch": 1.4645669291338583,
|
||
|
|
"grad_norm": 0.7071013927213717,
|
||
|
|
"learning_rate": 1.1278331866213253e-05,
|
||
|
|
"loss": 0.3968,
|
||
|
|
"mean_token_accuracy": 0.8596271779388189,
|
||
|
|
"num_tokens": 159850666.0,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420806884765625,
|
||
|
|
"epoch": 1.468503937007874,
|
||
|
|
"grad_norm": 0.8505216784835236,
|
||
|
|
"learning_rate": 1.1235224248730821e-05,
|
||
|
|
"loss": 0.4221,
|
||
|
|
"mean_token_accuracy": 0.8532926142215729,
|
||
|
|
"num_tokens": 160304984.0,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422149658203125,
|
||
|
|
"epoch": 1.4724409448818898,
|
||
|
|
"grad_norm": 0.7819391261612006,
|
||
|
|
"learning_rate": 1.1192093309114933e-05,
|
||
|
|
"loss": 0.4048,
|
||
|
|
"mean_token_accuracy": 0.8586824173107743,
|
||
|
|
"num_tokens": 160717222.0,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.434478759765625,
|
||
|
|
"epoch": 1.4763779527559056,
|
||
|
|
"grad_norm": 0.6951485933891031,
|
||
|
|
"learning_rate": 1.1148939861716124e-05,
|
||
|
|
"loss": 0.3963,
|
||
|
|
"mean_token_accuracy": 0.8589826161041856,
|
||
|
|
"num_tokens": 161120940.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.428924560546875,
|
||
|
|
"epoch": 1.4803149606299213,
|
||
|
|
"grad_norm": 0.7048568705927686,
|
||
|
|
"learning_rate": 1.11057647213099e-05,
|
||
|
|
"loss": 0.3902,
|
||
|
|
"mean_token_accuracy": 0.8643534425646067,
|
||
|
|
"num_tokens": 161534008.0,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43157958984375,
|
||
|
|
"epoch": 1.484251968503937,
|
||
|
|
"grad_norm": 0.6722355315921953,
|
||
|
|
"learning_rate": 1.1062568703081345e-05,
|
||
|
|
"loss": 0.4055,
|
||
|
|
"mean_token_accuracy": 0.8573996061459184,
|
||
|
|
"num_tokens": 161960631.0,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4361572265625,
|
||
|
|
"epoch": 1.4881889763779528,
|
||
|
|
"grad_norm": 0.7490074817348712,
|
||
|
|
"learning_rate": 1.1019352622609739e-05,
|
||
|
|
"loss": 0.4032,
|
||
|
|
"mean_token_accuracy": 0.8587049478664994,
|
||
|
|
"num_tokens": 162370841.0,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43182373046875,
|
||
|
|
"epoch": 1.4921259842519685,
|
||
|
|
"grad_norm": 0.7287285943859506,
|
||
|
|
"learning_rate": 1.0976117295853155e-05,
|
||
|
|
"loss": 0.4025,
|
||
|
|
"mean_token_accuracy": 0.8602571506053209,
|
||
|
|
"num_tokens": 162777887.0,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4241943359375,
|
||
|
|
"epoch": 1.4960629921259843,
|
||
|
|
"grad_norm": 0.7026020806956982,
|
||
|
|
"learning_rate": 1.093286353913305e-05,
|
||
|
|
"loss": 0.4161,
|
||
|
|
"mean_token_accuracy": 0.8565910197794437,
|
||
|
|
"num_tokens": 163218983.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.433135986328125,
|
||
|
|
"epoch": 1.5,
|
||
|
|
"grad_norm": 0.7204054324234719,
|
||
|
|
"learning_rate": 1.0889592169118857e-05,
|
||
|
|
"loss": 0.3933,
|
||
|
|
"mean_token_accuracy": 0.8597572650760412,
|
||
|
|
"num_tokens": 163623329.0,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427337646484375,
|
||
|
|
"epoch": 1.5039370078740157,
|
||
|
|
"grad_norm": 0.6903686768507965,
|
||
|
|
"learning_rate": 1.0846304002812564e-05,
|
||
|
|
"loss": 0.4033,
|
||
|
|
"mean_token_accuracy": 0.858160094358027,
|
||
|
|
"num_tokens": 164045642.0,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421173095703125,
|
||
|
|
"epoch": 1.5078740157480315,
|
||
|
|
"grad_norm": 0.6980585646431972,
|
||
|
|
"learning_rate": 1.0802999857533288e-05,
|
||
|
|
"loss": 0.4081,
|
||
|
|
"mean_token_accuracy": 0.8575689736753702,
|
||
|
|
"num_tokens": 164478955.0,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42193603515625,
|
||
|
|
"epoch": 1.5118110236220472,
|
||
|
|
"grad_norm": 0.9737017821239141,
|
||
|
|
"learning_rate": 1.0759680550901843e-05,
|
||
|
|
"loss": 0.4136,
|
||
|
|
"mean_token_accuracy": 0.8535346928983927,
|
||
|
|
"num_tokens": 164909956.0,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430023193359375,
|
||
|
|
"epoch": 1.515748031496063,
|
||
|
|
"grad_norm": 0.7106007919465092,
|
||
|
|
"learning_rate": 1.0716346900825298e-05,
|
||
|
|
"loss": 0.3999,
|
||
|
|
"mean_token_accuracy": 0.8596385335549712,
|
||
|
|
"num_tokens": 165331198.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42340087890625,
|
||
|
|
"epoch": 1.5196850393700787,
|
||
|
|
"grad_norm": 0.6820846944502266,
|
||
|
|
"learning_rate": 1.0672999725481549e-05,
|
||
|
|
"loss": 0.4079,
|
||
|
|
"mean_token_accuracy": 0.8579740738496184,
|
||
|
|
"num_tokens": 165758143.0,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42694091796875,
|
||
|
|
"epoch": 1.5236220472440944,
|
||
|
|
"grad_norm": 0.7542107896226161,
|
||
|
|
"learning_rate": 1.0629639843303857e-05,
|
||
|
|
"loss": 0.4011,
|
||
|
|
"mean_token_accuracy": 0.8585777133703232,
|
||
|
|
"num_tokens": 166173128.0,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421539306640625,
|
||
|
|
"epoch": 1.5275590551181102,
|
||
|
|
"grad_norm": 0.6609543935957453,
|
||
|
|
"learning_rate": 1.0586268072965395e-05,
|
||
|
|
"loss": 0.4126,
|
||
|
|
"mean_token_accuracy": 0.856118586845696,
|
||
|
|
"num_tokens": 166625567.0,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426116943359375,
|
||
|
|
"epoch": 1.531496062992126,
|
||
|
|
"grad_norm": 0.6894576484238519,
|
||
|
|
"learning_rate": 1.0542885233363797e-05,
|
||
|
|
"loss": 0.4006,
|
||
|
|
"mean_token_accuracy": 0.8593427939340472,
|
||
|
|
"num_tokens": 167051168.0,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426666259765625,
|
||
|
|
"epoch": 1.5354330708661417,
|
||
|
|
"grad_norm": 0.6806968879939391,
|
||
|
|
"learning_rate": 1.0499492143605698e-05,
|
||
|
|
"loss": 0.4015,
|
||
|
|
"mean_token_accuracy": 0.8581527229398489,
|
||
|
|
"num_tokens": 167486244.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425384521484375,
|
||
|
|
"epoch": 1.5393700787401574,
|
||
|
|
"grad_norm": 0.7586136635597545,
|
||
|
|
"learning_rate": 1.0456089622991264e-05,
|
||
|
|
"loss": 0.4226,
|
||
|
|
"mean_token_accuracy": 0.8518975591287017,
|
||
|
|
"num_tokens": 167928276.0,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42388916015625,
|
||
|
|
"epoch": 1.5433070866141732,
|
||
|
|
"grad_norm": 0.6526633786320399,
|
||
|
|
"learning_rate": 1.0412678490998717e-05,
|
||
|
|
"loss": 0.4031,
|
||
|
|
"mean_token_accuracy": 0.8584257867187262,
|
||
|
|
"num_tokens": 168355746.0,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4183349609375,
|
||
|
|
"epoch": 1.547244094488189,
|
||
|
|
"grad_norm": 0.6770265837460369,
|
||
|
|
"learning_rate": 1.0369259567268882e-05,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"mean_token_accuracy": 0.8612643834203482,
|
||
|
|
"num_tokens": 168798129.0,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42486572265625,
|
||
|
|
"epoch": 1.5511811023622046,
|
||
|
|
"grad_norm": 0.6920016700595446,
|
||
|
|
"learning_rate": 1.0325833671589687e-05,
|
||
|
|
"loss": 0.3995,
|
||
|
|
"mean_token_accuracy": 0.8574335686862469,
|
||
|
|
"num_tokens": 169217374.0,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42083740234375,
|
||
|
|
"epoch": 1.5551181102362204,
|
||
|
|
"grad_norm": 0.7056602515336996,
|
||
|
|
"learning_rate": 1.0282401623880704e-05,
|
||
|
|
"loss": 0.4057,
|
||
|
|
"mean_token_accuracy": 0.8575547644868493,
|
||
|
|
"num_tokens": 169656947.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42193603515625,
|
||
|
|
"epoch": 1.5590551181102361,
|
||
|
|
"grad_norm": 0.6909377706505936,
|
||
|
|
"learning_rate": 1.0238964244177657e-05,
|
||
|
|
"loss": 0.4048,
|
||
|
|
"mean_token_accuracy": 0.858373093418777,
|
||
|
|
"num_tokens": 170077729.0,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425323486328125,
|
||
|
|
"epoch": 1.5629921259842519,
|
||
|
|
"grad_norm": 0.7117437962313572,
|
||
|
|
"learning_rate": 1.0195522352616942e-05,
|
||
|
|
"loss": 0.4132,
|
||
|
|
"mean_token_accuracy": 0.8565368922427297,
|
||
|
|
"num_tokens": 170499116.0,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423797607421875,
|
||
|
|
"epoch": 1.5669291338582676,
|
||
|
|
"grad_norm": 0.7009070523233998,
|
||
|
|
"learning_rate": 1.0152076769420153e-05,
|
||
|
|
"loss": 0.3969,
|
||
|
|
"mean_token_accuracy": 0.8602678831666708,
|
||
|
|
"num_tokens": 170919221.0,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42755126953125,
|
||
|
|
"epoch": 1.5708661417322833,
|
||
|
|
"grad_norm": 0.6952782642754524,
|
||
|
|
"learning_rate": 1.0108628314878572e-05,
|
||
|
|
"loss": 0.4128,
|
||
|
|
"mean_token_accuracy": 0.8561601033434272,
|
||
|
|
"num_tokens": 171346417.0,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.431640625,
|
||
|
|
"epoch": 1.574803149606299,
|
||
|
|
"grad_norm": 0.6787762500080776,
|
||
|
|
"learning_rate": 1.0065177809337703e-05,
|
||
|
|
"loss": 0.3997,
|
||
|
|
"mean_token_accuracy": 0.8580770511180162,
|
||
|
|
"num_tokens": 171771196.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42333984375,
|
||
|
|
"epoch": 1.5787401574803148,
|
||
|
|
"grad_norm": 0.6641496005676443,
|
||
|
|
"learning_rate": 1.002172607318177e-05,
|
||
|
|
"loss": 0.3952,
|
||
|
|
"mean_token_accuracy": 0.8605634858831763,
|
||
|
|
"num_tokens": 172208563.0,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42852783203125,
|
||
|
|
"epoch": 1.5826771653543306,
|
||
|
|
"grad_norm": 0.6711002341847971,
|
||
|
|
"learning_rate": 9.978273926818233e-06,
|
||
|
|
"loss": 0.4041,
|
||
|
|
"mean_token_accuracy": 0.8582491222769022,
|
||
|
|
"num_tokens": 172622056.0,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4249267578125,
|
||
|
|
"epoch": 1.5866141732283463,
|
||
|
|
"grad_norm": 0.6591812100767495,
|
||
|
|
"learning_rate": 9.934822190662299e-06,
|
||
|
|
"loss": 0.4133,
|
||
|
|
"mean_token_accuracy": 0.8562856521457434,
|
||
|
|
"num_tokens": 173072118.0,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424102783203125,
|
||
|
|
"epoch": 1.590551181102362,
|
||
|
|
"grad_norm": 0.670607342101713,
|
||
|
|
"learning_rate": 9.89137168512143e-06,
|
||
|
|
"loss": 0.4036,
|
||
|
|
"mean_token_accuracy": 0.8581979488953948,
|
||
|
|
"num_tokens": 173501197.0,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42413330078125,
|
||
|
|
"epoch": 1.594488188976378,
|
||
|
|
"grad_norm": 0.7082529364693488,
|
||
|
|
"learning_rate": 9.847923230579848e-06,
|
||
|
|
"loss": 0.4006,
|
||
|
|
"mean_token_accuracy": 0.8585393913090229,
|
||
|
|
"num_tokens": 173938951.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432098388671875,
|
||
|
|
"epoch": 1.5984251968503937,
|
||
|
|
"grad_norm": 0.7505153208274351,
|
||
|
|
"learning_rate": 9.804477647383061e-06,
|
||
|
|
"loss": 0.4051,
|
||
|
|
"mean_token_accuracy": 0.8565549207851291,
|
||
|
|
"num_tokens": 174344014.0,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425811767578125,
|
||
|
|
"epoch": 1.6023622047244095,
|
||
|
|
"grad_norm": 0.6819153612915562,
|
||
|
|
"learning_rate": 9.761035755822347e-06,
|
||
|
|
"loss": 0.3974,
|
||
|
|
"mean_token_accuracy": 0.8594374163076282,
|
||
|
|
"num_tokens": 174767953.0,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426727294921875,
|
||
|
|
"epoch": 1.6062992125984252,
|
||
|
|
"grad_norm": 0.661234527207505,
|
||
|
|
"learning_rate": 9.717598376119301e-06,
|
||
|
|
"loss": 0.4028,
|
||
|
|
"mean_token_accuracy": 0.858024075627327,
|
||
|
|
"num_tokens": 175190002.0,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42169189453125,
|
||
|
|
"epoch": 1.610236220472441,
|
||
|
|
"grad_norm": 0.678400946836127,
|
||
|
|
"learning_rate": 9.674166328410318e-06,
|
||
|
|
"loss": 0.4057,
|
||
|
|
"mean_token_accuracy": 0.8575535602867603,
|
||
|
|
"num_tokens": 175637779.0,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429595947265625,
|
||
|
|
"epoch": 1.6141732283464567,
|
||
|
|
"grad_norm": 0.6484799686944109,
|
||
|
|
"learning_rate": 9.630740432731123e-06,
|
||
|
|
"loss": 0.396,
|
||
|
|
"mean_token_accuracy": 0.8635142697021365,
|
||
|
|
"num_tokens": 176063629.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432220458984375,
|
||
|
|
"epoch": 1.6181102362204725,
|
||
|
|
"grad_norm": 0.6562134804867598,
|
||
|
|
"learning_rate": 9.587321509001288e-06,
|
||
|
|
"loss": 0.4129,
|
||
|
|
"mean_token_accuracy": 0.8563750553876162,
|
||
|
|
"num_tokens": 176489720.0,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429534912109375,
|
||
|
|
"epoch": 1.6220472440944882,
|
||
|
|
"grad_norm": 0.619336896640731,
|
||
|
|
"learning_rate": 9.543910377008741e-06,
|
||
|
|
"loss": 0.4094,
|
||
|
|
"mean_token_accuracy": 0.859122664667666,
|
||
|
|
"num_tokens": 176932692.0,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4376220703125,
|
||
|
|
"epoch": 1.625984251968504,
|
||
|
|
"grad_norm": 0.6608522160653064,
|
||
|
|
"learning_rate": 9.5005078563943e-06,
|
||
|
|
"loss": 0.3992,
|
||
|
|
"mean_token_accuracy": 0.8599905716255307,
|
||
|
|
"num_tokens": 177345218.0,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.44317626953125,
|
||
|
|
"epoch": 1.6299212598425197,
|
||
|
|
"grad_norm": 0.6802186143674549,
|
||
|
|
"learning_rate": 9.457114766636203e-06,
|
||
|
|
"loss": 0.4074,
|
||
|
|
"mean_token_accuracy": 0.8593994919210672,
|
||
|
|
"num_tokens": 177769367.0,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42620849609375,
|
||
|
|
"epoch": 1.6338582677165354,
|
||
|
|
"grad_norm": 0.6424723588977458,
|
||
|
|
"learning_rate": 9.413731927034607e-06,
|
||
|
|
"loss": 0.3942,
|
||
|
|
"mean_token_accuracy": 0.8619447741657495,
|
||
|
|
"num_tokens": 178213481.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.431243896484375,
|
||
|
|
"epoch": 1.6377952755905512,
|
||
|
|
"grad_norm": 0.6273568531212063,
|
||
|
|
"learning_rate": 9.370360156696143e-06,
|
||
|
|
"loss": 0.4009,
|
||
|
|
"mean_token_accuracy": 0.8575205830857158,
|
||
|
|
"num_tokens": 178649774.0,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.43670654296875,
|
||
|
|
"epoch": 1.641732283464567,
|
||
|
|
"grad_norm": 0.6791296212575202,
|
||
|
|
"learning_rate": 9.327000274518453e-06,
|
||
|
|
"loss": 0.4069,
|
||
|
|
"mean_token_accuracy": 0.8575573619455099,
|
||
|
|
"num_tokens": 179067613.0,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.433624267578125,
|
||
|
|
"epoch": 1.6456692913385826,
|
||
|
|
"grad_norm": 0.6728543531262471,
|
||
|
|
"learning_rate": 9.283653099174704e-06,
|
||
|
|
"loss": 0.4207,
|
||
|
|
"mean_token_accuracy": 0.8519806191325188,
|
||
|
|
"num_tokens": 179499205.0,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.437164306640625,
|
||
|
|
"epoch": 1.6496062992125984,
|
||
|
|
"grad_norm": 0.649958590187722,
|
||
|
|
"learning_rate": 9.24031944909816e-06,
|
||
|
|
"loss": 0.3958,
|
||
|
|
"mean_token_accuracy": 0.8601369233801961,
|
||
|
|
"num_tokens": 179929093.0,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.434326171875,
|
||
|
|
"epoch": 1.6535433070866141,
|
||
|
|
"grad_norm": 0.6737577071985799,
|
||
|
|
"learning_rate": 9.197000142466715e-06,
|
||
|
|
"loss": 0.4059,
|
||
|
|
"mean_token_accuracy": 0.8580764941871166,
|
||
|
|
"num_tokens": 180389799.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.432830810546875,
|
||
|
|
"epoch": 1.65748031496063,
|
||
|
|
"grad_norm": 0.6492707624195736,
|
||
|
|
"learning_rate": 9.15369599718744e-06,
|
||
|
|
"loss": 0.3915,
|
||
|
|
"mean_token_accuracy": 0.8608764903619885,
|
||
|
|
"num_tokens": 180820852.0,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4285888671875,
|
||
|
|
"epoch": 1.6614173228346458,
|
||
|
|
"grad_norm": 0.6603995867326616,
|
||
|
|
"learning_rate": 9.110407830881146e-06,
|
||
|
|
"loss": 0.381,
|
||
|
|
"mean_token_accuracy": 0.8649307256564498,
|
||
|
|
"num_tokens": 181247065.0,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421783447265625,
|
||
|
|
"epoch": 1.6653543307086616,
|
||
|
|
"grad_norm": 0.6781308462951336,
|
||
|
|
"learning_rate": 9.067136460866954e-06,
|
||
|
|
"loss": 0.4085,
|
||
|
|
"mean_token_accuracy": 0.8567010900005698,
|
||
|
|
"num_tokens": 181685279.0,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423583984375,
|
||
|
|
"epoch": 1.6692913385826773,
|
||
|
|
"grad_norm": 0.6770679227525491,
|
||
|
|
"learning_rate": 9.023882704146848e-06,
|
||
|
|
"loss": 0.3951,
|
||
|
|
"mean_token_accuracy": 0.8610436161980033,
|
||
|
|
"num_tokens": 182096431.0,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427703857421875,
|
||
|
|
"epoch": 1.673228346456693,
|
||
|
|
"grad_norm": 0.7056744478166704,
|
||
|
|
"learning_rate": 8.980647377390263e-06,
|
||
|
|
"loss": 0.4031,
|
||
|
|
"mean_token_accuracy": 0.8577226242050529,
|
||
|
|
"num_tokens": 182526927.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423248291015625,
|
||
|
|
"epoch": 1.6771653543307088,
|
||
|
|
"grad_norm": 0.6929097942926685,
|
||
|
|
"learning_rate": 8.937431296918658e-06,
|
||
|
|
"loss": 0.3962,
|
||
|
|
"mean_token_accuracy": 0.8589769685640931,
|
||
|
|
"num_tokens": 182948540.0,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426116943359375,
|
||
|
|
"epoch": 1.6811023622047245,
|
||
|
|
"grad_norm": 0.6513465373807524,
|
||
|
|
"learning_rate": 8.894235278690104e-06,
|
||
|
|
"loss": 0.396,
|
||
|
|
"mean_token_accuracy": 0.8591319024562836,
|
||
|
|
"num_tokens": 183370596.0,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.436279296875,
|
||
|
|
"epoch": 1.6850393700787403,
|
||
|
|
"grad_norm": 0.6697597022943024,
|
||
|
|
"learning_rate": 8.85106013828388e-06,
|
||
|
|
"loss": 0.3925,
|
||
|
|
"mean_token_accuracy": 0.8593562422320247,
|
||
|
|
"num_tokens": 183754398.0,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42852783203125,
|
||
|
|
"epoch": 1.688976377952756,
|
||
|
|
"grad_norm": 0.6752483884886014,
|
||
|
|
"learning_rate": 8.80790669088507e-06,
|
||
|
|
"loss": 0.3984,
|
||
|
|
"mean_token_accuracy": 0.8606278160586953,
|
||
|
|
"num_tokens": 184176837.0,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425018310546875,
|
||
|
|
"epoch": 1.6929133858267718,
|
||
|
|
"grad_norm": 0.666204242518818,
|
||
|
|
"learning_rate": 8.764775751269184e-06,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"mean_token_accuracy": 0.8630478298291564,
|
||
|
|
"num_tokens": 184600270.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421630859375,
|
||
|
|
"epoch": 1.6968503937007875,
|
||
|
|
"grad_norm": 0.6686378991246827,
|
||
|
|
"learning_rate": 8.721668133786752e-06,
|
||
|
|
"loss": 0.3942,
|
||
|
|
"mean_token_accuracy": 0.8612590469419956,
|
||
|
|
"num_tokens": 185036728.0,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4161376953125,
|
||
|
|
"epoch": 1.7007874015748032,
|
||
|
|
"grad_norm": 0.7354074820468248,
|
||
|
|
"learning_rate": 8.678584652347974e-06,
|
||
|
|
"loss": 0.4132,
|
||
|
|
"mean_token_accuracy": 0.8553070295602083,
|
||
|
|
"num_tokens": 185474069.0,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426055908203125,
|
||
|
|
"epoch": 1.704724409448819,
|
||
|
|
"grad_norm": 0.7695082385707869,
|
||
|
|
"learning_rate": 8.63552612040733e-06,
|
||
|
|
"loss": 0.4045,
|
||
|
|
"mean_token_accuracy": 0.8592064557597041,
|
||
|
|
"num_tokens": 185892235.0,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42218017578125,
|
||
|
|
"epoch": 1.7086614173228347,
|
||
|
|
"grad_norm": 0.6611227329939291,
|
||
|
|
"learning_rate": 8.592493350948237e-06,
|
||
|
|
"loss": 0.3902,
|
||
|
|
"mean_token_accuracy": 0.8618718609213829,
|
||
|
|
"num_tokens": 186316312.0,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4195556640625,
|
||
|
|
"epoch": 1.7125984251968505,
|
||
|
|
"grad_norm": 0.7015841661068855,
|
||
|
|
"learning_rate": 8.549487156467691e-06,
|
||
|
|
"loss": 0.3939,
|
||
|
|
"mean_token_accuracy": 0.8596938429400325,
|
||
|
|
"num_tokens": 186742337.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420379638671875,
|
||
|
|
"epoch": 1.7165354330708662,
|
||
|
|
"grad_norm": 0.7060512471368613,
|
||
|
|
"learning_rate": 8.506508348960924e-06,
|
||
|
|
"loss": 0.3865,
|
||
|
|
"mean_token_accuracy": 0.8619933761656284,
|
||
|
|
"num_tokens": 187178664.0,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42059326171875,
|
||
|
|
"epoch": 1.720472440944882,
|
||
|
|
"grad_norm": 0.6387961887270812,
|
||
|
|
"learning_rate": 8.463557739906094e-06,
|
||
|
|
"loss": 0.3926,
|
||
|
|
"mean_token_accuracy": 0.8623356893658638,
|
||
|
|
"num_tokens": 187604003.0,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.416229248046875,
|
||
|
|
"epoch": 1.7244094488188977,
|
||
|
|
"grad_norm": 0.672058128974811,
|
||
|
|
"learning_rate": 8.42063614024893e-06,
|
||
|
|
"loss": 0.3944,
|
||
|
|
"mean_token_accuracy": 0.859029428102076,
|
||
|
|
"num_tokens": 188050204.0,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.417755126953125,
|
||
|
|
"epoch": 1.7283464566929134,
|
||
|
|
"grad_norm": 0.7264979233715146,
|
||
|
|
"learning_rate": 8.377744360387447e-06,
|
||
|
|
"loss": 0.396,
|
||
|
|
"mean_token_accuracy": 0.8614224148914218,
|
||
|
|
"num_tokens": 188482220.0,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41650390625,
|
||
|
|
"epoch": 1.7322834645669292,
|
||
|
|
"grad_norm": 0.673741771374478,
|
||
|
|
"learning_rate": 8.334883210156629e-06,
|
||
|
|
"loss": 0.3869,
|
||
|
|
"mean_token_accuracy": 0.8622207688167691,
|
||
|
|
"num_tokens": 188913952.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418426513671875,
|
||
|
|
"epoch": 1.736220472440945,
|
||
|
|
"grad_norm": 0.6980231663029676,
|
||
|
|
"learning_rate": 8.292053498813149e-06,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"mean_token_accuracy": 0.8624574858695269,
|
||
|
|
"num_tokens": 189328708.0,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4112548828125,
|
||
|
|
"epoch": 1.7401574803149606,
|
||
|
|
"grad_norm": 0.7061589196986463,
|
||
|
|
"learning_rate": 8.249256035020086e-06,
|
||
|
|
"loss": 0.3915,
|
||
|
|
"mean_token_accuracy": 0.8615807592868805,
|
||
|
|
"num_tokens": 189795592.0,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418060302734375,
|
||
|
|
"epoch": 1.7440944881889764,
|
||
|
|
"grad_norm": 0.6641651313861893,
|
||
|
|
"learning_rate": 8.20649162683166e-06,
|
||
|
|
"loss": 0.3904,
|
||
|
|
"mean_token_accuracy": 0.8617926817387342,
|
||
|
|
"num_tokens": 190225732.0,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41802978515625,
|
||
|
|
"epoch": 1.7480314960629921,
|
||
|
|
"grad_norm": 0.877405172892866,
|
||
|
|
"learning_rate": 8.163761081677962e-06,
|
||
|
|
"loss": 0.4026,
|
||
|
|
"mean_token_accuracy": 0.8586821621283889,
|
||
|
|
"num_tokens": 190657380.0,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.417144775390625,
|
||
|
|
"epoch": 1.7519685039370079,
|
||
|
|
"grad_norm": 0.7049436005394818,
|
||
|
|
"learning_rate": 8.12106520634973e-06,
|
||
|
|
"loss": 0.3976,
|
||
|
|
"mean_token_accuracy": 0.8597701685503125,
|
||
|
|
"num_tokens": 191094942.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430206298828125,
|
||
|
|
"epoch": 1.7559055118110236,
|
||
|
|
"grad_norm": 0.6749513439611118,
|
||
|
|
"learning_rate": 8.078404806983096e-06,
|
||
|
|
"loss": 0.3812,
|
||
|
|
"mean_token_accuracy": 0.8642371194437146,
|
||
|
|
"num_tokens": 191505327.0,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.417510986328125,
|
||
|
|
"epoch": 1.7598425196850394,
|
||
|
|
"grad_norm": 0.7424818686985509,
|
||
|
|
"learning_rate": 8.035780689044381e-06,
|
||
|
|
"loss": 0.3866,
|
||
|
|
"mean_token_accuracy": 0.8627121299505234,
|
||
|
|
"num_tokens": 191944808.0,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420654296875,
|
||
|
|
"epoch": 1.763779527559055,
|
||
|
|
"grad_norm": 0.6958968245355566,
|
||
|
|
"learning_rate": 7.993193657314874e-06,
|
||
|
|
"loss": 0.3908,
|
||
|
|
"mean_token_accuracy": 0.8628187980502844,
|
||
|
|
"num_tokens": 192357839.0,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42572021484375,
|
||
|
|
"epoch": 1.7677165354330708,
|
||
|
|
"grad_norm": 0.6651030427108865,
|
||
|
|
"learning_rate": 7.95064451587565e-06,
|
||
|
|
"loss": 0.4052,
|
||
|
|
"mean_token_accuracy": 0.8572182497009635,
|
||
|
|
"num_tokens": 192784432.0,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41754150390625,
|
||
|
|
"epoch": 1.7716535433070866,
|
||
|
|
"grad_norm": 0.626078997321363,
|
||
|
|
"learning_rate": 7.908134068092375e-06,
|
||
|
|
"loss": 0.3913,
|
||
|
|
"mean_token_accuracy": 0.8627305366098881,
|
||
|
|
"num_tokens": 193232872.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42828369140625,
|
||
|
|
"epoch": 1.7755905511811023,
|
||
|
|
"grad_norm": 0.6801797942178992,
|
||
|
|
"learning_rate": 7.865663116600149e-06,
|
||
|
|
"loss": 0.3999,
|
||
|
|
"mean_token_accuracy": 0.859979891218245,
|
||
|
|
"num_tokens": 193673419.0,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42181396484375,
|
||
|
|
"epoch": 1.779527559055118,
|
||
|
|
"grad_norm": 0.6282853116668798,
|
||
|
|
"learning_rate": 7.823232463288344e-06,
|
||
|
|
"loss": 0.384,
|
||
|
|
"mean_token_accuracy": 0.8639516020193696,
|
||
|
|
"num_tokens": 194112394.0,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418121337890625,
|
||
|
|
"epoch": 1.7834645669291338,
|
||
|
|
"grad_norm": 0.6796178904809435,
|
||
|
|
"learning_rate": 7.780842909285471e-06,
|
||
|
|
"loss": 0.4193,
|
||
|
|
"mean_token_accuracy": 0.8537283595651388,
|
||
|
|
"num_tokens": 194562565.0,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42059326171875,
|
||
|
|
"epoch": 1.7874015748031495,
|
||
|
|
"grad_norm": 0.704596185547038,
|
||
|
|
"learning_rate": 7.738495254944042e-06,
|
||
|
|
"loss": 0.3904,
|
||
|
|
"mean_token_accuracy": 0.8616365287452936,
|
||
|
|
"num_tokens": 195006242.0,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42645263671875,
|
||
|
|
"epoch": 1.7913385826771653,
|
||
|
|
"grad_norm": 0.660325371978726,
|
||
|
|
"learning_rate": 7.696190299825474e-06,
|
||
|
|
"loss": 0.4005,
|
||
|
|
"mean_token_accuracy": 0.8596520023420453,
|
||
|
|
"num_tokens": 195428816.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423370361328125,
|
||
|
|
"epoch": 1.795275590551181,
|
||
|
|
"grad_norm": 0.6709634013713723,
|
||
|
|
"learning_rate": 7.65392884268498e-06,
|
||
|
|
"loss": 0.3912,
|
||
|
|
"mean_token_accuracy": 0.8635533200576901,
|
||
|
|
"num_tokens": 195853603.0,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426971435546875,
|
||
|
|
"epoch": 1.7992125984251968,
|
||
|
|
"grad_norm": 0.6881407153331328,
|
||
|
|
"learning_rate": 7.611711681456493e-06,
|
||
|
|
"loss": 0.401,
|
||
|
|
"mean_token_accuracy": 0.8594867596402764,
|
||
|
|
"num_tokens": 196275977.0,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420867919921875,
|
||
|
|
"epoch": 1.8031496062992125,
|
||
|
|
"grad_norm": 0.6417147807851576,
|
||
|
|
"learning_rate": 7.569539613237595e-06,
|
||
|
|
"loss": 0.3954,
|
||
|
|
"mean_token_accuracy": 0.8610730767250061,
|
||
|
|
"num_tokens": 196713889.0,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418243408203125,
|
||
|
|
"epoch": 1.8070866141732282,
|
||
|
|
"grad_norm": 0.6310110212469618,
|
||
|
|
"learning_rate": 7.527413434274487e-06,
|
||
|
|
"loss": 0.3885,
|
||
|
|
"mean_token_accuracy": 0.8633995288982987,
|
||
|
|
"num_tokens": 197144027.0,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.417816162109375,
|
||
|
|
"epoch": 1.811023622047244,
|
||
|
|
"grad_norm": 0.6597677933522972,
|
||
|
|
"learning_rate": 7.485333939946926e-06,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"mean_token_accuracy": 0.8605340076610446,
|
||
|
|
"num_tokens": 197580520.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4188232421875,
|
||
|
|
"epoch": 1.8149606299212597,
|
||
|
|
"grad_norm": 0.6723363401244932,
|
||
|
|
"learning_rate": 7.443301924753224e-06,
|
||
|
|
"loss": 0.3993,
|
||
|
|
"mean_token_accuracy": 0.8572817407548428,
|
||
|
|
"num_tokens": 198014473.0,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41595458984375,
|
||
|
|
"epoch": 1.8188976377952755,
|
||
|
|
"grad_norm": 0.6173635578239941,
|
||
|
|
"learning_rate": 7.4013181822952484e-06,
|
||
|
|
"loss": 0.3894,
|
||
|
|
"mean_token_accuracy": 0.8615096509456635,
|
||
|
|
"num_tokens": 198455804.0,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41363525390625,
|
||
|
|
"epoch": 1.8228346456692912,
|
||
|
|
"grad_norm": 0.6661812853397687,
|
||
|
|
"learning_rate": 7.359383505263431e-06,
|
||
|
|
"loss": 0.3856,
|
||
|
|
"mean_token_accuracy": 0.8620446948334575,
|
||
|
|
"num_tokens": 198870929.0,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4166259765625,
|
||
|
|
"epoch": 1.826771653543307,
|
||
|
|
"grad_norm": 0.6574118930371434,
|
||
|
|
"learning_rate": 7.317498685421803e-06,
|
||
|
|
"loss": 0.3879,
|
||
|
|
"mean_token_accuracy": 0.8604755392298102,
|
||
|
|
"num_tokens": 199305535.0,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4122314453125,
|
||
|
|
"epoch": 1.8307086614173227,
|
||
|
|
"grad_norm": 0.6434036995854752,
|
||
|
|
"learning_rate": 7.275664513593057e-06,
|
||
|
|
"loss": 0.3836,
|
||
|
|
"mean_token_accuracy": 0.8634475152939558,
|
||
|
|
"num_tokens": 199750969.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4150390625,
|
||
|
|
"epoch": 1.8346456692913384,
|
||
|
|
"grad_norm": 0.6764928500122582,
|
||
|
|
"learning_rate": 7.233881779643595e-06,
|
||
|
|
"loss": 0.3916,
|
||
|
|
"mean_token_accuracy": 0.863021994009614,
|
||
|
|
"num_tokens": 200195670.0,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4140625,
|
||
|
|
"epoch": 1.8385826771653542,
|
||
|
|
"grad_norm": 0.9197242588519385,
|
||
|
|
"learning_rate": 7.19215127246863e-06,
|
||
|
|
"loss": 0.3788,
|
||
|
|
"mean_token_accuracy": 0.8645974956452847,
|
||
|
|
"num_tokens": 200634020.0,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420623779296875,
|
||
|
|
"epoch": 1.84251968503937,
|
||
|
|
"grad_norm": 0.683308798654025,
|
||
|
|
"learning_rate": 7.150473779977292e-06,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"mean_token_accuracy": 0.8626933787018061,
|
||
|
|
"num_tokens": 201068695.0,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420379638671875,
|
||
|
|
"epoch": 1.8464566929133859,
|
||
|
|
"grad_norm": 0.6859535560379857,
|
||
|
|
"learning_rate": 7.108850089077736e-06,
|
||
|
|
"loss": 0.3938,
|
||
|
|
"mean_token_accuracy": 0.8602238912135363,
|
||
|
|
"num_tokens": 201508090.0,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4178466796875,
|
||
|
|
"epoch": 1.8503937007874016,
|
||
|
|
"grad_norm": 0.6740037040573118,
|
||
|
|
"learning_rate": 7.0672809856623036e-06,
|
||
|
|
"loss": 0.3792,
|
||
|
|
"mean_token_accuracy": 0.8625660231336951,
|
||
|
|
"num_tokens": 201919084.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41619873046875,
|
||
|
|
"epoch": 1.8543307086614174,
|
||
|
|
"grad_norm": 0.6767452030032078,
|
||
|
|
"learning_rate": 7.0257672545926755e-06,
|
||
|
|
"loss": 0.3829,
|
||
|
|
"mean_token_accuracy": 0.864514097571373,
|
||
|
|
"num_tokens": 202352097.0,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4151611328125,
|
||
|
|
"epoch": 1.858267716535433,
|
||
|
|
"grad_norm": 0.6681058032124264,
|
||
|
|
"learning_rate": 6.984309679685049e-06,
|
||
|
|
"loss": 0.3896,
|
||
|
|
"mean_token_accuracy": 0.8648245232179761,
|
||
|
|
"num_tokens": 202784053.0,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420684814453125,
|
||
|
|
"epoch": 1.8622047244094488,
|
||
|
|
"grad_norm": 0.6281109991828322,
|
||
|
|
"learning_rate": 6.942909043695345e-06,
|
||
|
|
"loss": 0.394,
|
||
|
|
"mean_token_accuracy": 0.8599920589476824,
|
||
|
|
"num_tokens": 203218387.0,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420074462890625,
|
||
|
|
"epoch": 1.8661417322834646,
|
||
|
|
"grad_norm": 0.6650717181718757,
|
||
|
|
"learning_rate": 6.901566128304429e-06,
|
||
|
|
"loss": 0.3949,
|
||
|
|
"mean_token_accuracy": 0.8606854053214192,
|
||
|
|
"num_tokens": 203648613.0,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41729736328125,
|
||
|
|
"epoch": 1.8700787401574803,
|
||
|
|
"grad_norm": 0.688270984124111,
|
||
|
|
"learning_rate": 6.86028171410335e-06,
|
||
|
|
"loss": 0.3977,
|
||
|
|
"mean_token_accuracy": 0.8606042871251702,
|
||
|
|
"num_tokens": 204085760.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424652099609375,
|
||
|
|
"epoch": 1.874015748031496,
|
||
|
|
"grad_norm": 0.66428003722815,
|
||
|
|
"learning_rate": 6.8190565805785965e-06,
|
||
|
|
"loss": 0.3819,
|
||
|
|
"mean_token_accuracy": 0.8656391901895404,
|
||
|
|
"num_tokens": 204510920.0,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42144775390625,
|
||
|
|
"epoch": 1.8779527559055118,
|
||
|
|
"grad_norm": 0.7095703761600634,
|
||
|
|
"learning_rate": 6.777891506097394e-06,
|
||
|
|
"loss": 0.3817,
|
||
|
|
"mean_token_accuracy": 0.8649838771671057,
|
||
|
|
"num_tokens": 204928475.0,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41656494140625,
|
||
|
|
"epoch": 1.8818897637795275,
|
||
|
|
"grad_norm": 0.6557237079195142,
|
||
|
|
"learning_rate": 6.736787267892991e-06,
|
||
|
|
"loss": 0.3752,
|
||
|
|
"mean_token_accuracy": 0.8673708308488131,
|
||
|
|
"num_tokens": 205375452.0,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427093505859375,
|
||
|
|
"epoch": 1.8858267716535433,
|
||
|
|
"grad_norm": 0.6954548155651175,
|
||
|
|
"learning_rate": 6.695744642050001e-06,
|
||
|
|
"loss": 0.3928,
|
||
|
|
"mean_token_accuracy": 0.8597937086597085,
|
||
|
|
"num_tokens": 205791665.0,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418548583984375,
|
||
|
|
"epoch": 1.889763779527559,
|
||
|
|
"grad_norm": 0.6341218489395974,
|
||
|
|
"learning_rate": 6.654764403489737e-06,
|
||
|
|
"loss": 0.3775,
|
||
|
|
"mean_token_accuracy": 0.8657321650534868,
|
||
|
|
"num_tokens": 206213381.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.424468994140625,
|
||
|
|
"epoch": 1.8937007874015748,
|
||
|
|
"grad_norm": 0.6614311846483276,
|
||
|
|
"learning_rate": 6.613847325955578e-06,
|
||
|
|
"loss": 0.3786,
|
||
|
|
"mean_token_accuracy": 0.8649399066343904,
|
||
|
|
"num_tokens": 206621419.0,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4228515625,
|
||
|
|
"epoch": 1.8976377952755905,
|
||
|
|
"grad_norm": 0.6827200000056922,
|
||
|
|
"learning_rate": 6.572994181998385e-06,
|
||
|
|
"loss": 0.3867,
|
||
|
|
"mean_token_accuracy": 0.8616899996995926,
|
||
|
|
"num_tokens": 207051076.0,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418701171875,
|
||
|
|
"epoch": 1.9015748031496063,
|
||
|
|
"grad_norm": 0.6787415604952725,
|
||
|
|
"learning_rate": 6.532205742961881e-06,
|
||
|
|
"loss": 0.3903,
|
||
|
|
"mean_token_accuracy": 0.8626525811851025,
|
||
|
|
"num_tokens": 207493089.0,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.428619384765625,
|
||
|
|
"epoch": 1.905511811023622,
|
||
|
|
"grad_norm": 0.7016721452472603,
|
||
|
|
"learning_rate": 6.491482778968103e-06,
|
||
|
|
"loss": 0.3934,
|
||
|
|
"mean_token_accuracy": 0.8631287338212132,
|
||
|
|
"num_tokens": 207907472.0,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4241943359375,
|
||
|
|
"epoch": 1.909448818897638,
|
||
|
|
"grad_norm": 0.7185424042860358,
|
||
|
|
"learning_rate": 6.450826058902868e-06,
|
||
|
|
"loss": 0.4029,
|
||
|
|
"mean_token_accuracy": 0.858769909478724,
|
||
|
|
"num_tokens": 208333643.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.426910400390625,
|
||
|
|
"epoch": 1.9133858267716537,
|
||
|
|
"grad_norm": 0.6673224326835845,
|
||
|
|
"learning_rate": 6.41023635040124e-06,
|
||
|
|
"loss": 0.3878,
|
||
|
|
"mean_token_accuracy": 0.862129864282906,
|
||
|
|
"num_tokens": 208770435.0,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.430267333984375,
|
||
|
|
"epoch": 1.9173228346456694,
|
||
|
|
"grad_norm": 0.6437703200019689,
|
||
|
|
"learning_rate": 6.369714419833056e-06,
|
||
|
|
"loss": 0.3834,
|
||
|
|
"mean_token_accuracy": 0.8656902518123388,
|
||
|
|
"num_tokens": 209205852.0,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420867919921875,
|
||
|
|
"epoch": 1.9212598425196852,
|
||
|
|
"grad_norm": 0.6072676743596053,
|
||
|
|
"learning_rate": 6.3292610322884365e-06,
|
||
|
|
"loss": 0.3792,
|
||
|
|
"mean_token_accuracy": 0.8643869431689382,
|
||
|
|
"num_tokens": 209663707.0,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.429290771484375,
|
||
|
|
"epoch": 1.925196850393701,
|
||
|
|
"grad_norm": 0.6705397286490944,
|
||
|
|
"learning_rate": 6.288876951563352e-06,
|
||
|
|
"loss": 0.3654,
|
||
|
|
"mean_token_accuracy": 0.868832329288125,
|
||
|
|
"num_tokens": 210064008.0,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.428009033203125,
|
||
|
|
"epoch": 1.9291338582677167,
|
||
|
|
"grad_norm": 0.6636918090501086,
|
||
|
|
"learning_rate": 6.2485629401451954e-06,
|
||
|
|
"loss": 0.3944,
|
||
|
|
"mean_token_accuracy": 0.8630169397220016,
|
||
|
|
"num_tokens": 210492962.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.42431640625,
|
||
|
|
"epoch": 1.9330708661417324,
|
||
|
|
"grad_norm": 0.6519084693087212,
|
||
|
|
"learning_rate": 6.2083197591983935e-06,
|
||
|
|
"loss": 0.3829,
|
||
|
|
"mean_token_accuracy": 0.8642466831952333,
|
||
|
|
"num_tokens": 210923592.0,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.413330078125,
|
||
|
|
"epoch": 1.9370078740157481,
|
||
|
|
"grad_norm": 0.61204996961206,
|
||
|
|
"learning_rate": 6.168148168550029e-06,
|
||
|
|
"loss": 0.3808,
|
||
|
|
"mean_token_accuracy": 0.8643651902675629,
|
||
|
|
"num_tokens": 211364636.0,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4154052734375,
|
||
|
|
"epoch": 1.9409448818897639,
|
||
|
|
"grad_norm": 0.636254485190738,
|
||
|
|
"learning_rate": 6.128048926675494e-06,
|
||
|
|
"loss": 0.3759,
|
||
|
|
"mean_token_accuracy": 0.8656032215803862,
|
||
|
|
"num_tokens": 211799550.0,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41650390625,
|
||
|
|
"epoch": 1.9448818897637796,
|
||
|
|
"grad_norm": 0.6406331468098087,
|
||
|
|
"learning_rate": 6.088022790684174e-06,
|
||
|
|
"loss": 0.3794,
|
||
|
|
"mean_token_accuracy": 0.8653112007305026,
|
||
|
|
"num_tokens": 212222704.0,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.418792724609375,
|
||
|
|
"epoch": 1.9488188976377954,
|
||
|
|
"grad_norm": 0.6499997573957319,
|
||
|
|
"learning_rate": 6.048070516305147e-06,
|
||
|
|
"loss": 0.3799,
|
||
|
|
"mean_token_accuracy": 0.86427709646523,
|
||
|
|
"num_tokens": 212669140.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.425079345703125,
|
||
|
|
"epoch": 1.952755905511811,
|
||
|
|
"grad_norm": 0.665145279400102,
|
||
|
|
"learning_rate": 6.0081928578729235e-06,
|
||
|
|
"loss": 0.3802,
|
||
|
|
"mean_token_accuracy": 0.8652894785627723,
|
||
|
|
"num_tokens": 213102653.0,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4298095703125,
|
||
|
|
"epoch": 1.9566929133858268,
|
||
|
|
"grad_norm": 0.6567115782965014,
|
||
|
|
"learning_rate": 5.968390568313194e-06,
|
||
|
|
"loss": 0.4041,
|
||
|
|
"mean_token_accuracy": 0.8583439188078046,
|
||
|
|
"num_tokens": 213537439.0,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.421966552734375,
|
||
|
|
"epoch": 1.9606299212598426,
|
||
|
|
"grad_norm": 0.6356796950899889,
|
||
|
|
"learning_rate": 5.928664399128618e-06,
|
||
|
|
"loss": 0.3956,
|
||
|
|
"mean_token_accuracy": 0.8614333514124155,
|
||
|
|
"num_tokens": 213977409.0,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.427032470703125,
|
||
|
|
"epoch": 1.9645669291338583,
|
||
|
|
"grad_norm": 0.6258173971122529,
|
||
|
|
"learning_rate": 5.889015100384636e-06,
|
||
|
|
"loss": 0.3927,
|
||
|
|
"mean_token_accuracy": 0.8633041819557548,
|
||
|
|
"num_tokens": 214409581.0,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.419891357421875,
|
||
|
|
"epoch": 1.968503937007874,
|
||
|
|
"grad_norm": 0.6365710396494279,
|
||
|
|
"learning_rate": 5.8494434206953054e-06,
|
||
|
|
"loss": 0.3745,
|
||
|
|
"mean_token_accuracy": 0.8675504606217146,
|
||
|
|
"num_tokens": 214842294.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.419921875,
|
||
|
|
"epoch": 1.9724409448818898,
|
||
|
|
"grad_norm": 0.673936395220552,
|
||
|
|
"learning_rate": 5.809950107209168e-06,
|
||
|
|
"loss": 0.3825,
|
||
|
|
"mean_token_accuracy": 0.863592054694891,
|
||
|
|
"num_tokens": 215283386.0,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.416656494140625,
|
||
|
|
"epoch": 1.9763779527559056,
|
||
|
|
"grad_norm": 0.6782512956847635,
|
||
|
|
"learning_rate": 5.770535905595138e-06,
|
||
|
|
"loss": 0.39,
|
||
|
|
"mean_token_accuracy": 0.8610916286706924,
|
||
|
|
"num_tokens": 215720079.0,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.423675537109375,
|
||
|
|
"epoch": 1.9803149606299213,
|
||
|
|
"grad_norm": 0.6940032370900541,
|
||
|
|
"learning_rate": 5.731201560028432e-06,
|
||
|
|
"loss": 0.3809,
|
||
|
|
"mean_token_accuracy": 0.8642973145470023,
|
||
|
|
"num_tokens": 216131160.0,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422760009765625,
|
||
|
|
"epoch": 1.984251968503937,
|
||
|
|
"grad_norm": 0.6566881027504372,
|
||
|
|
"learning_rate": 5.6919478131765075e-06,
|
||
|
|
"loss": 0.3771,
|
||
|
|
"mean_token_accuracy": 0.8662382122129202,
|
||
|
|
"num_tokens": 216557614.0,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4281005859375,
|
||
|
|
"epoch": 1.9881889763779528,
|
||
|
|
"grad_norm": 0.6871982977212773,
|
||
|
|
"learning_rate": 5.652775406185056e-06,
|
||
|
|
"loss": 0.3787,
|
||
|
|
"mean_token_accuracy": 0.8649186259135604,
|
||
|
|
"num_tokens": 216973484.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.420867919921875,
|
||
|
|
"epoch": 1.9921259842519685,
|
||
|
|
"grad_norm": 0.6620807044087577,
|
||
|
|
"learning_rate": 5.613685078663993e-06,
|
||
|
|
"loss": 0.3779,
|
||
|
|
"mean_token_accuracy": 0.8658335618674755,
|
||
|
|
"num_tokens": 217399571.0,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.422088623046875,
|
||
|
|
"epoch": 1.9960629921259843,
|
||
|
|
"grad_norm": 0.6447316252113497,
|
||
|
|
"learning_rate": 5.574677568673499e-06,
|
||
|
|
"loss": 0.375,
|
||
|
|
"mean_token_accuracy": 0.8671787939965725,
|
||
|
|
"num_tokens": 217820936.0,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41986083984375,
|
||
|
|
"epoch": 2.0,
|
||
|
|
"grad_norm": 0.6671106585235214,
|
||
|
|
"learning_rate": 5.535753612710091e-06,
|
||
|
|
"loss": 0.3747,
|
||
|
|
"mean_token_accuracy": 0.866083949804306,
|
||
|
|
"num_tokens": 218248519.0,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4205322265625,
|
||
|
|
"epoch": 2.0039370078740157,
|
||
|
|
"grad_norm": 0.7691060576967338,
|
||
|
|
"learning_rate": 5.496913945692706e-06,
|
||
|
|
"loss": 0.3357,
|
||
|
|
"mean_token_accuracy": 0.8782863048836589,
|
||
|
|
"num_tokens": 218681722.0,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.41546630859375,
|
||
|
|
"epoch": 2.0078740157480315,
|
||
|
|
"grad_norm": 0.706074532223459,
|
||
|
|
"learning_rate": 5.458159300948837e-06,
|
||
|
|
"loss": 0.3355,
|
||
|
|
"mean_token_accuracy": 0.8784531345590949,
|
||
|
|
"num_tokens": 219097671.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40863037109375,
|
||
|
|
"epoch": 2.0118110236220472,
|
||
|
|
"grad_norm": 0.6827314808658607,
|
||
|
|
"learning_rate": 5.419490410200675e-06,
|
||
|
|
"loss": 0.3501,
|
||
|
|
"mean_token_accuracy": 0.8720254069194198,
|
||
|
|
"num_tokens": 219548387.0,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399017333984375,
|
||
|
|
"epoch": 2.015748031496063,
|
||
|
|
"grad_norm": 0.7517593980159016,
|
||
|
|
"learning_rate": 5.3809080035513e-06,
|
||
|
|
"loss": 0.3217,
|
||
|
|
"mean_token_accuracy": 0.883345877751708,
|
||
|
|
"num_tokens": 219990743.0,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40924072265625,
|
||
|
|
"epoch": 2.0196850393700787,
|
||
|
|
"grad_norm": 0.9325438750744767,
|
||
|
|
"learning_rate": 5.342412809470903e-06,
|
||
|
|
"loss": 0.3426,
|
||
|
|
"mean_token_accuracy": 0.87769855838269,
|
||
|
|
"num_tokens": 220414422.0,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.403961181640625,
|
||
|
|
"epoch": 2.0236220472440944,
|
||
|
|
"grad_norm": 0.6686083056431692,
|
||
|
|
"learning_rate": 5.304005554783015e-06,
|
||
|
|
"loss": 0.3353,
|
||
|
|
"mean_token_accuracy": 0.8792219227179885,
|
||
|
|
"num_tokens": 220847772.0,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4119873046875,
|
||
|
|
"epoch": 2.02755905511811,
|
||
|
|
"grad_norm": 0.6959836173942446,
|
||
|
|
"learning_rate": 5.265686964650796e-06,
|
||
|
|
"loss": 0.3315,
|
||
|
|
"mean_token_accuracy": 0.8809208925813437,
|
||
|
|
"num_tokens": 221279092.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4127197265625,
|
||
|
|
"epoch": 2.031496062992126,
|
||
|
|
"grad_norm": 0.705326284320785,
|
||
|
|
"learning_rate": 5.227457762563339e-06,
|
||
|
|
"loss": 0.3372,
|
||
|
|
"mean_token_accuracy": 0.8781589884310961,
|
||
|
|
"num_tokens": 221698923.0,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415802001953125,
|
||
|
|
"epoch": 2.0354330708661417,
|
||
|
|
"grad_norm": 0.6949441787042704,
|
||
|
|
"learning_rate": 5.189318670322016e-06,
|
||
|
|
"loss": 0.3398,
|
||
|
|
"mean_token_accuracy": 0.8793862201273441,
|
||
|
|
"num_tokens": 222134184.0,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.408905029296875,
|
||
|
|
"epoch": 2.0393700787401574,
|
||
|
|
"grad_norm": 0.7195006498702371,
|
||
|
|
"learning_rate": 5.151270408026839e-06,
|
||
|
|
"loss": 0.3281,
|
||
|
|
"mean_token_accuracy": 0.8829669477418065,
|
||
|
|
"num_tokens": 222571682.0,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.415283203125,
|
||
|
|
"epoch": 2.043307086614173,
|
||
|
|
"grad_norm": 0.6442176858506412,
|
||
|
|
"learning_rate": 5.113313694062869e-06,
|
||
|
|
"loss": 0.3345,
|
||
|
|
"mean_token_accuracy": 0.8803455736488104,
|
||
|
|
"num_tokens": 223006861.0,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40582275390625,
|
||
|
|
"epoch": 2.047244094488189,
|
||
|
|
"grad_norm": 0.7512390521563385,
|
||
|
|
"learning_rate": 5.075449245086661e-06,
|
||
|
|
"loss": 0.3332,
|
||
|
|
"mean_token_accuracy": 0.8804942537099123,
|
||
|
|
"num_tokens": 223430085.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40545654296875,
|
||
|
|
"epoch": 2.0511811023622046,
|
||
|
|
"grad_norm": 0.7818713463029238,
|
||
|
|
"learning_rate": 5.037677776012719e-06,
|
||
|
|
"loss": 0.3502,
|
||
|
|
"mean_token_accuracy": 0.8745955023914576,
|
||
|
|
"num_tokens": 223863341.0,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402984619140625,
|
||
|
|
"epoch": 2.0551181102362204,
|
||
|
|
"grad_norm": 0.6654412815987092,
|
||
|
|
"learning_rate": 5.000000000000003e-06,
|
||
|
|
"loss": 0.353,
|
||
|
|
"mean_token_accuracy": 0.8740874016657472,
|
||
|
|
"num_tokens": 224309404.0,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40625,
|
||
|
|
"epoch": 2.059055118110236,
|
||
|
|
"grad_norm": 0.6620816958634905,
|
||
|
|
"learning_rate": 4.962416628438466e-06,
|
||
|
|
"loss": 0.3322,
|
||
|
|
"mean_token_accuracy": 0.8814442995935678,
|
||
|
|
"num_tokens": 224731123.0,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.404052734375,
|
||
|
|
"epoch": 2.062992125984252,
|
||
|
|
"grad_norm": 0.6684098815971495,
|
||
|
|
"learning_rate": 4.924928370935622e-06,
|
||
|
|
"loss": 0.3352,
|
||
|
|
"mean_token_accuracy": 0.8791022077202797,
|
||
|
|
"num_tokens": 225161374.0,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.412261962890625,
|
||
|
|
"epoch": 2.0669291338582676,
|
||
|
|
"grad_norm": 0.6915973611592388,
|
||
|
|
"learning_rate": 4.887535935303147e-06,
|
||
|
|
"loss": 0.3306,
|
||
|
|
"mean_token_accuracy": 0.880140382796526,
|
||
|
|
"num_tokens": 225592845.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40618896484375,
|
||
|
|
"epoch": 2.0708661417322833,
|
||
|
|
"grad_norm": 0.6917073506681685,
|
||
|
|
"learning_rate": 4.850240027543509e-06,
|
||
|
|
"loss": 0.3411,
|
||
|
|
"mean_token_accuracy": 0.8773935958743095,
|
||
|
|
"num_tokens": 226031171.0,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.407501220703125,
|
||
|
|
"epoch": 2.074803149606299,
|
||
|
|
"grad_norm": 0.6590869362543731,
|
||
|
|
"learning_rate": 4.813041351836657e-06,
|
||
|
|
"loss": 0.3253,
|
||
|
|
"mean_token_accuracy": 0.8814301686361432,
|
||
|
|
"num_tokens": 226484203.0,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4034423828125,
|
||
|
|
"epoch": 2.078740157480315,
|
||
|
|
"grad_norm": 0.660774132093501,
|
||
|
|
"learning_rate": 4.775940610526698e-06,
|
||
|
|
"loss": 0.3371,
|
||
|
|
"mean_token_accuracy": 0.8811690313741565,
|
||
|
|
"num_tokens": 226923813.0,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4007568359375,
|
||
|
|
"epoch": 2.0826771653543306,
|
||
|
|
"grad_norm": 0.7267964307643108,
|
||
|
|
"learning_rate": 4.738938504108659e-06,
|
||
|
|
"loss": 0.3287,
|
||
|
|
"mean_token_accuracy": 0.8822143021970987,
|
||
|
|
"num_tokens": 227364235.0,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.401336669921875,
|
||
|
|
"epoch": 2.0866141732283463,
|
||
|
|
"grad_norm": 0.7011620090348606,
|
||
|
|
"learning_rate": 4.702035731215249e-06,
|
||
|
|
"loss": 0.3245,
|
||
|
|
"mean_token_accuracy": 0.8842730978503823,
|
||
|
|
"num_tokens": 227804681.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399322509765625,
|
||
|
|
"epoch": 2.090551181102362,
|
||
|
|
"grad_norm": 0.6994922281758611,
|
||
|
|
"learning_rate": 4.665232988603671e-06,
|
||
|
|
"loss": 0.3346,
|
||
|
|
"mean_token_accuracy": 0.8798423083499074,
|
||
|
|
"num_tokens": 228233941.0,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.408050537109375,
|
||
|
|
"epoch": 2.094488188976378,
|
||
|
|
"grad_norm": 0.659969290721089,
|
||
|
|
"learning_rate": 4.6285309711424706e-06,
|
||
|
|
"loss": 0.3282,
|
||
|
|
"mean_token_accuracy": 0.8802306912839413,
|
||
|
|
"num_tokens": 228671574.0,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40264892578125,
|
||
|
|
"epoch": 2.0984251968503935,
|
||
|
|
"grad_norm": 0.7160845199984103,
|
||
|
|
"learning_rate": 4.59193037179841e-06,
|
||
|
|
"loss": 0.334,
|
||
|
|
"mean_token_accuracy": 0.8797960076481104,
|
||
|
|
"num_tokens": 229097422.0,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.410980224609375,
|
||
|
|
"epoch": 2.1023622047244093,
|
||
|
|
"grad_norm": 0.6651663210919708,
|
||
|
|
"learning_rate": 4.555431881623384e-06,
|
||
|
|
"loss": 0.3324,
|
||
|
|
"mean_token_accuracy": 0.8817898659035563,
|
||
|
|
"num_tokens": 229532488.0,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4046630859375,
|
||
|
|
"epoch": 2.106299212598425,
|
||
|
|
"grad_norm": 0.6719211249811485,
|
||
|
|
"learning_rate": 4.519036189741386e-06,
|
||
|
|
"loss": 0.3378,
|
||
|
|
"mean_token_accuracy": 0.8793261991813779,
|
||
|
|
"num_tokens": 229955370.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4071044921875,
|
||
|
|
"epoch": 2.1102362204724407,
|
||
|
|
"grad_norm": 0.683765412515,
|
||
|
|
"learning_rate": 4.482743983335478e-06,
|
||
|
|
"loss": 0.3203,
|
||
|
|
"mean_token_accuracy": 0.883794778957963,
|
||
|
|
"num_tokens": 230366605.0,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.403594970703125,
|
||
|
|
"epoch": 2.1141732283464565,
|
||
|
|
"grad_norm": 0.6938692767492354,
|
||
|
|
"learning_rate": 4.446555947634825e-06,
|
||
|
|
"loss": 0.3329,
|
||
|
|
"mean_token_accuracy": 0.8809925802052021,
|
||
|
|
"num_tokens": 230800239.0,
|
||
|
|
"step": 537
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.400390625,
|
||
|
|
"epoch": 2.1181102362204722,
|
||
|
|
"grad_norm": 0.6579903284426181,
|
||
|
|
"learning_rate": 4.410472765901755e-06,
|
||
|
|
"loss": 0.3368,
|
||
|
|
"mean_token_accuracy": 0.8784988336265087,
|
||
|
|
"num_tokens": 231220452.0,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402130126953125,
|
||
|
|
"epoch": 2.122047244094488,
|
||
|
|
"grad_norm": 0.6480009306742889,
|
||
|
|
"learning_rate": 4.3744951194188645e-06,
|
||
|
|
"loss": 0.3318,
|
||
|
|
"mean_token_accuracy": 0.8805544385686517,
|
||
|
|
"num_tokens": 231669259.0,
|
||
|
|
"step": 539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.404144287109375,
|
||
|
|
"epoch": 2.1259842519685037,
|
||
|
|
"grad_norm": 0.6762874373310868,
|
||
|
|
"learning_rate": 4.3386236874761455e-06,
|
||
|
|
"loss": 0.3307,
|
||
|
|
"mean_token_accuracy": 0.8821213049814105,
|
||
|
|
"num_tokens": 232092788.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.405059814453125,
|
||
|
|
"epoch": 2.1299212598425195,
|
||
|
|
"grad_norm": 0.7007715804079939,
|
||
|
|
"learning_rate": 4.302859147358168e-06,
|
||
|
|
"loss": 0.3285,
|
||
|
|
"mean_token_accuracy": 0.8826727429404855,
|
||
|
|
"num_tokens": 232515170.0,
|
||
|
|
"step": 541
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402435302734375,
|
||
|
|
"epoch": 2.1338582677165356,
|
||
|
|
"grad_norm": 0.7058653274548208,
|
||
|
|
"learning_rate": 4.267202174331288e-06,
|
||
|
|
"loss": 0.3293,
|
||
|
|
"mean_token_accuracy": 0.8815928604453802,
|
||
|
|
"num_tokens": 232951104.0,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.401519775390625,
|
||
|
|
"epoch": 2.1377952755905514,
|
||
|
|
"grad_norm": 0.6786820083449475,
|
||
|
|
"learning_rate": 4.231653441630899e-06,
|
||
|
|
"loss": 0.332,
|
||
|
|
"mean_token_accuracy": 0.8781663812696934,
|
||
|
|
"num_tokens": 233357394.0,
|
||
|
|
"step": 543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40057373046875,
|
||
|
|
"epoch": 2.141732283464567,
|
||
|
|
"grad_norm": 0.6791074299791867,
|
||
|
|
"learning_rate": 4.196213620448724e-06,
|
||
|
|
"loss": 0.3297,
|
||
|
|
"mean_token_accuracy": 0.8811912108212709,
|
||
|
|
"num_tokens": 233777649.0,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4022216796875,
|
||
|
|
"epoch": 2.145669291338583,
|
||
|
|
"grad_norm": 0.6865111051458076,
|
||
|
|
"learning_rate": 4.160883379920132e-06,
|
||
|
|
"loss": 0.3254,
|
||
|
|
"mean_token_accuracy": 0.8841365138068795,
|
||
|
|
"num_tokens": 234210947.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40753173828125,
|
||
|
|
"epoch": 2.1496062992125986,
|
||
|
|
"grad_norm": 0.6603880166281437,
|
||
|
|
"learning_rate": 4.125663387111519e-06,
|
||
|
|
"loss": 0.3354,
|
||
|
|
"mean_token_accuracy": 0.8794803349301219,
|
||
|
|
"num_tokens": 234642824.0,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402801513671875,
|
||
|
|
"epoch": 2.1535433070866143,
|
||
|
|
"grad_norm": 0.6910095210213717,
|
||
|
|
"learning_rate": 4.0905543070077036e-06,
|
||
|
|
"loss": 0.3411,
|
||
|
|
"mean_token_accuracy": 0.8780597625300288,
|
||
|
|
"num_tokens": 235081344.0,
|
||
|
|
"step": 547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40283203125,
|
||
|
|
"epoch": 2.15748031496063,
|
||
|
|
"grad_norm": 0.6916532852158944,
|
||
|
|
"learning_rate": 4.055556802499373e-06,
|
||
|
|
"loss": 0.3245,
|
||
|
|
"mean_token_accuracy": 0.8826562752947211,
|
||
|
|
"num_tokens": 235519773.0,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.403900146484375,
|
||
|
|
"epoch": 2.161417322834646,
|
||
|
|
"grad_norm": 0.6605629307385316,
|
||
|
|
"learning_rate": 4.020671534370566e-06,
|
||
|
|
"loss": 0.3392,
|
||
|
|
"mean_token_accuracy": 0.8770075533539057,
|
||
|
|
"num_tokens": 235952995.0,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40240478515625,
|
||
|
|
"epoch": 2.1653543307086616,
|
||
|
|
"grad_norm": 0.6954491968470974,
|
||
|
|
"learning_rate": 3.985899161286205e-06,
|
||
|
|
"loss": 0.3411,
|
||
|
|
"mean_token_accuracy": 0.8809783374890685,
|
||
|
|
"num_tokens": 236377384.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.400390625,
|
||
|
|
"epoch": 2.1692913385826773,
|
||
|
|
"grad_norm": 0.6955945501474788,
|
||
|
|
"learning_rate": 3.951240339779649e-06,
|
||
|
|
"loss": 0.3383,
|
||
|
|
"mean_token_accuracy": 0.8795864386484027,
|
||
|
|
"num_tokens": 236805272.0,
|
||
|
|
"step": 551
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399261474609375,
|
||
|
|
"epoch": 2.173228346456693,
|
||
|
|
"grad_norm": 0.6811865815204354,
|
||
|
|
"learning_rate": 3.916695724240302e-06,
|
||
|
|
"loss": 0.3243,
|
||
|
|
"mean_token_accuracy": 0.8818806270137429,
|
||
|
|
"num_tokens": 237240034.0,
|
||
|
|
"step": 552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.404266357421875,
|
||
|
|
"epoch": 2.177165354330709,
|
||
|
|
"grad_norm": 0.6604827060686905,
|
||
|
|
"learning_rate": 3.882265966901257e-06,
|
||
|
|
"loss": 0.3338,
|
||
|
|
"mean_token_accuracy": 0.8813010770827532,
|
||
|
|
"num_tokens": 237676815.0,
|
||
|
|
"step": 553
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397491455078125,
|
||
|
|
"epoch": 2.1811023622047245,
|
||
|
|
"grad_norm": 0.6951568747277557,
|
||
|
|
"learning_rate": 3.847951717826984e-06,
|
||
|
|
"loss": 0.3196,
|
||
|
|
"mean_token_accuracy": 0.8818888068199158,
|
||
|
|
"num_tokens": 238121169.0,
|
||
|
|
"step": 554
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3974609375,
|
||
|
|
"epoch": 2.1850393700787403,
|
||
|
|
"grad_norm": 0.6604458548896505,
|
||
|
|
"learning_rate": 3.813753624901053e-06,
|
||
|
|
"loss": 0.3235,
|
||
|
|
"mean_token_accuracy": 0.883362052962184,
|
||
|
|
"num_tokens": 238560402.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399688720703125,
|
||
|
|
"epoch": 2.188976377952756,
|
||
|
|
"grad_norm": 0.6595354267544808,
|
||
|
|
"learning_rate": 3.7796723338138995e-06,
|
||
|
|
"loss": 0.3425,
|
||
|
|
"mean_token_accuracy": 0.876252400688827,
|
||
|
|
"num_tokens": 238996471.0,
|
||
|
|
"step": 556
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39691162109375,
|
||
|
|
"epoch": 2.1929133858267718,
|
||
|
|
"grad_norm": 0.6640704940514063,
|
||
|
|
"learning_rate": 3.7457084880506465e-06,
|
||
|
|
"loss": 0.3284,
|
||
|
|
"mean_token_accuracy": 0.8791890293359756,
|
||
|
|
"num_tokens": 239431088.0,
|
||
|
|
"step": 557
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402496337890625,
|
||
|
|
"epoch": 2.1968503937007875,
|
||
|
|
"grad_norm": 0.6267558305460987,
|
||
|
|
"learning_rate": 3.7118627288789355e-06,
|
||
|
|
"loss": 0.3274,
|
||
|
|
"mean_token_accuracy": 0.8825696604326367,
|
||
|
|
"num_tokens": 239851102.0,
|
||
|
|
"step": 558
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39837646484375,
|
||
|
|
"epoch": 2.2007874015748032,
|
||
|
|
"grad_norm": 0.650661471745278,
|
||
|
|
"learning_rate": 3.6781356953368286e-06,
|
||
|
|
"loss": 0.3253,
|
||
|
|
"mean_token_accuracy": 0.8800821900367737,
|
||
|
|
"num_tokens": 240286779.0,
|
||
|
|
"step": 559
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4083251953125,
|
||
|
|
"epoch": 2.204724409448819,
|
||
|
|
"grad_norm": 0.6676395521640903,
|
||
|
|
"learning_rate": 3.644528024220745e-06,
|
||
|
|
"loss": 0.352,
|
||
|
|
"mean_token_accuracy": 0.8772099521011114,
|
||
|
|
"num_tokens": 240715293.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4019775390625,
|
||
|
|
"epoch": 2.2086614173228347,
|
||
|
|
"grad_norm": 0.6661145894616234,
|
||
|
|
"learning_rate": 3.6110403500734325e-06,
|
||
|
|
"loss": 0.3277,
|
||
|
|
"mean_token_accuracy": 0.8814953323453665,
|
||
|
|
"num_tokens": 241151984.0,
|
||
|
|
"step": 561
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4093017578125,
|
||
|
|
"epoch": 2.2125984251968505,
|
||
|
|
"grad_norm": 0.6267579335510625,
|
||
|
|
"learning_rate": 3.5776733051719935e-06,
|
||
|
|
"loss": 0.3276,
|
||
|
|
"mean_token_accuracy": 0.882298044860363,
|
||
|
|
"num_tokens": 241582173.0,
|
||
|
|
"step": 562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396942138671875,
|
||
|
|
"epoch": 2.216535433070866,
|
||
|
|
"grad_norm": 1.2833446633947432,
|
||
|
|
"learning_rate": 3.5444275195159395e-06,
|
||
|
|
"loss": 0.3259,
|
||
|
|
"mean_token_accuracy": 0.8836051663383842,
|
||
|
|
"num_tokens": 242022323.0,
|
||
|
|
"step": 563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397369384765625,
|
||
|
|
"epoch": 2.220472440944882,
|
||
|
|
"grad_norm": 0.6483374974300078,
|
||
|
|
"learning_rate": 3.5113036208152994e-06,
|
||
|
|
"loss": 0.3138,
|
||
|
|
"mean_token_accuracy": 0.8874066807329655,
|
||
|
|
"num_tokens": 242449211.0,
|
||
|
|
"step": 564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39886474609375,
|
||
|
|
"epoch": 2.2244094488188977,
|
||
|
|
"grad_norm": 0.7449795107940003,
|
||
|
|
"learning_rate": 3.4783022344787698e-06,
|
||
|
|
"loss": 0.3416,
|
||
|
|
"mean_token_accuracy": 0.8797894669696689,
|
||
|
|
"num_tokens": 242868913.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39984130859375,
|
||
|
|
"epoch": 2.2283464566929134,
|
||
|
|
"grad_norm": 0.7384068020075348,
|
||
|
|
"learning_rate": 3.4454239836019032e-06,
|
||
|
|
"loss": 0.335,
|
||
|
|
"mean_token_accuracy": 0.8808618625625968,
|
||
|
|
"num_tokens": 243288176.0,
|
||
|
|
"step": 566
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40191650390625,
|
||
|
|
"epoch": 2.232283464566929,
|
||
|
|
"grad_norm": 0.6138346261513192,
|
||
|
|
"learning_rate": 3.412669488955346e-06,
|
||
|
|
"loss": 0.3265,
|
||
|
|
"mean_token_accuracy": 0.8829402485862374,
|
||
|
|
"num_tokens": 243712004.0,
|
||
|
|
"step": 567
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398529052734375,
|
||
|
|
"epoch": 2.236220472440945,
|
||
|
|
"grad_norm": 0.6826323150688026,
|
||
|
|
"learning_rate": 3.380039368973115e-06,
|
||
|
|
"loss": 0.3525,
|
||
|
|
"mean_token_accuracy": 0.8730159010738134,
|
||
|
|
"num_tokens": 244151217.0,
|
||
|
|
"step": 568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39910888671875,
|
||
|
|
"epoch": 2.2401574803149606,
|
||
|
|
"grad_norm": 0.7731434019517666,
|
||
|
|
"learning_rate": 3.347534239740925e-06,
|
||
|
|
"loss": 0.3299,
|
||
|
|
"mean_token_accuracy": 0.8819707138463855,
|
||
|
|
"num_tokens": 244570025.0,
|
||
|
|
"step": 569
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40252685546875,
|
||
|
|
"epoch": 2.2440944881889764,
|
||
|
|
"grad_norm": 0.7958821930394692,
|
||
|
|
"learning_rate": 3.315154714984554e-06,
|
||
|
|
"loss": 0.3311,
|
||
|
|
"mean_token_accuracy": 0.8825913481414318,
|
||
|
|
"num_tokens": 244995470.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395721435546875,
|
||
|
|
"epoch": 2.248031496062992,
|
||
|
|
"grad_norm": 0.6803195610061324,
|
||
|
|
"learning_rate": 3.2829014060582498e-06,
|
||
|
|
"loss": 0.3174,
|
||
|
|
"mean_token_accuracy": 0.8865452529862523,
|
||
|
|
"num_tokens": 245441326.0,
|
||
|
|
"step": 571
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396636962890625,
|
||
|
|
"epoch": 2.251968503937008,
|
||
|
|
"grad_norm": 0.6508860212504988,
|
||
|
|
"learning_rate": 3.2507749219332065e-06,
|
||
|
|
"loss": 0.3249,
|
||
|
|
"mean_token_accuracy": 0.8850123547017574,
|
||
|
|
"num_tokens": 245869452.0,
|
||
|
|
"step": 572
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39361572265625,
|
||
|
|
"epoch": 2.2559055118110236,
|
||
|
|
"grad_norm": 0.6814325606200489,
|
||
|
|
"learning_rate": 3.218775869186038e-06,
|
||
|
|
"loss": 0.32,
|
||
|
|
"mean_token_accuracy": 0.8850808152928948,
|
||
|
|
"num_tokens": 246283765.0,
|
||
|
|
"step": 573
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390350341796875,
|
||
|
|
"epoch": 2.2598425196850394,
|
||
|
|
"grad_norm": 0.7249147176255266,
|
||
|
|
"learning_rate": 3.1869048519873514e-06,
|
||
|
|
"loss": 0.3319,
|
||
|
|
"mean_token_accuracy": 0.882703147828579,
|
||
|
|
"num_tokens": 246712103.0,
|
||
|
|
"step": 574
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397247314453125,
|
||
|
|
"epoch": 2.263779527559055,
|
||
|
|
"grad_norm": 0.7259919964209407,
|
||
|
|
"learning_rate": 3.1551624720903197e-06,
|
||
|
|
"loss": 0.3278,
|
||
|
|
"mean_token_accuracy": 0.8822907945141196,
|
||
|
|
"num_tokens": 247128803.0,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.408172607421875,
|
||
|
|
"epoch": 2.267716535433071,
|
||
|
|
"grad_norm": 0.6430937463791561,
|
||
|
|
"learning_rate": 3.1235493288193363e-06,
|
||
|
|
"loss": 0.3154,
|
||
|
|
"mean_token_accuracy": 0.8839439991861582,
|
||
|
|
"num_tokens": 247530472.0,
|
||
|
|
"step": 576
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3909912109375,
|
||
|
|
"epoch": 2.2716535433070866,
|
||
|
|
"grad_norm": 0.6538968560952247,
|
||
|
|
"learning_rate": 3.0920660190586893e-06,
|
||
|
|
"loss": 0.3227,
|
||
|
|
"mean_token_accuracy": 0.8826503995805979,
|
||
|
|
"num_tokens": 247965594.0,
|
||
|
|
"step": 577
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.392974853515625,
|
||
|
|
"epoch": 2.2755905511811023,
|
||
|
|
"grad_norm": 0.65502612474774,
|
||
|
|
"learning_rate": 3.0607131372412903e-06,
|
||
|
|
"loss": 0.3342,
|
||
|
|
"mean_token_accuracy": 0.8804387804120779,
|
||
|
|
"num_tokens": 248401499.0,
|
||
|
|
"step": 578
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40069580078125,
|
||
|
|
"epoch": 2.279527559055118,
|
||
|
|
"grad_norm": 0.6717303153956555,
|
||
|
|
"learning_rate": 3.029491275337466e-06,
|
||
|
|
"loss": 0.3267,
|
||
|
|
"mean_token_accuracy": 0.8830945594236255,
|
||
|
|
"num_tokens": 248833275.0,
|
||
|
|
"step": 579
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391693115234375,
|
||
|
|
"epoch": 2.283464566929134,
|
||
|
|
"grad_norm": 0.662496244283556,
|
||
|
|
"learning_rate": 2.998401022843761e-06,
|
||
|
|
"loss": 0.3285,
|
||
|
|
"mean_token_accuracy": 0.8807523930445313,
|
||
|
|
"num_tokens": 249261595.0,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3966064453125,
|
||
|
|
"epoch": 2.2874015748031495,
|
||
|
|
"grad_norm": 0.6439773446458666,
|
||
|
|
"learning_rate": 2.9674429667718198e-06,
|
||
|
|
"loss": 0.3233,
|
||
|
|
"mean_token_accuracy": 0.8819929100573063,
|
||
|
|
"num_tokens": 249679398.0,
|
||
|
|
"step": 581
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394989013671875,
|
||
|
|
"epoch": 2.2913385826771653,
|
||
|
|
"grad_norm": 0.618547629691431,
|
||
|
|
"learning_rate": 2.9366176916373024e-06,
|
||
|
|
"loss": 0.3257,
|
||
|
|
"mean_token_accuracy": 0.8828444425016642,
|
||
|
|
"num_tokens": 250123653.0,
|
||
|
|
"step": 582
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395904541015625,
|
||
|
|
"epoch": 2.295275590551181,
|
||
|
|
"grad_norm": 0.6734411095539774,
|
||
|
|
"learning_rate": 2.9059257794488428e-06,
|
||
|
|
"loss": 0.3253,
|
||
|
|
"mean_token_accuracy": 0.8810829911381006,
|
||
|
|
"num_tokens": 250553262.0,
|
||
|
|
"step": 583
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395111083984375,
|
||
|
|
"epoch": 2.2992125984251968,
|
||
|
|
"grad_norm": 0.6609096857306654,
|
||
|
|
"learning_rate": 2.875367809697067e-06,
|
||
|
|
"loss": 0.3247,
|
||
|
|
"mean_token_accuracy": 0.8829443035647273,
|
||
|
|
"num_tokens": 250963697.0,
|
||
|
|
"step": 584
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394805908203125,
|
||
|
|
"epoch": 2.3031496062992125,
|
||
|
|
"grad_norm": 0.6843839300325607,
|
||
|
|
"learning_rate": 2.84494435934365e-06,
|
||
|
|
"loss": 0.3174,
|
||
|
|
"mean_token_accuracy": 0.8860497623682022,
|
||
|
|
"num_tokens": 251383717.0,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4027099609375,
|
||
|
|
"epoch": 2.3070866141732282,
|
||
|
|
"grad_norm": 0.693475193681948,
|
||
|
|
"learning_rate": 2.8146560028104155e-06,
|
||
|
|
"loss": 0.3268,
|
||
|
|
"mean_token_accuracy": 0.8848175024613738,
|
||
|
|
"num_tokens": 251809123.0,
|
||
|
|
"step": 586
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399444580078125,
|
||
|
|
"epoch": 2.311023622047244,
|
||
|
|
"grad_norm": 0.6482014364527828,
|
||
|
|
"learning_rate": 2.7845033119684996e-06,
|
||
|
|
"loss": 0.3249,
|
||
|
|
"mean_token_accuracy": 0.8843581713736057,
|
||
|
|
"num_tokens": 252232897.0,
|
||
|
|
"step": 587
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394683837890625,
|
||
|
|
"epoch": 2.3149606299212597,
|
||
|
|
"grad_norm": 0.6511044909303425,
|
||
|
|
"learning_rate": 2.7544868561275473e-06,
|
||
|
|
"loss": 0.3033,
|
||
|
|
"mean_token_accuracy": 0.8875886900350451,
|
||
|
|
"num_tokens": 252652937.0,
|
||
|
|
"step": 588
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393707275390625,
|
||
|
|
"epoch": 2.3188976377952755,
|
||
|
|
"grad_norm": 0.6526243088549442,
|
||
|
|
"learning_rate": 2.724607202024969e-06,
|
||
|
|
"loss": 0.3151,
|
||
|
|
"mean_token_accuracy": 0.884684244170785,
|
||
|
|
"num_tokens": 253092659.0,
|
||
|
|
"step": 589
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.400543212890625,
|
||
|
|
"epoch": 2.322834645669291,
|
||
|
|
"grad_norm": 0.7074127197129519,
|
||
|
|
"learning_rate": 2.694864913815234e-06,
|
||
|
|
"loss": 0.3318,
|
||
|
|
"mean_token_accuracy": 0.8794507039710879,
|
||
|
|
"num_tokens": 253513268.0,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3931884765625,
|
||
|
|
"epoch": 2.326771653543307,
|
||
|
|
"grad_norm": 0.6708725885175661,
|
||
|
|
"learning_rate": 2.665260553059219e-06,
|
||
|
|
"loss": 0.3079,
|
||
|
|
"mean_token_accuracy": 0.8883746191859245,
|
||
|
|
"num_tokens": 253932423.0,
|
||
|
|
"step": 591
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391082763671875,
|
||
|
|
"epoch": 2.3307086614173227,
|
||
|
|
"grad_norm": 0.642752601240437,
|
||
|
|
"learning_rate": 2.635794678713611e-06,
|
||
|
|
"loss": 0.3335,
|
||
|
|
"mean_token_accuracy": 0.8798313392326236,
|
||
|
|
"num_tokens": 254366200.0,
|
||
|
|
"step": 592
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39373779296875,
|
||
|
|
"epoch": 2.3346456692913384,
|
||
|
|
"grad_norm": 0.632491220317453,
|
||
|
|
"learning_rate": 2.6064678471203497e-06,
|
||
|
|
"loss": 0.3197,
|
||
|
|
"mean_token_accuracy": 0.8848578063771129,
|
||
|
|
"num_tokens": 254777165.0,
|
||
|
|
"step": 593
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391265869140625,
|
||
|
|
"epoch": 2.338582677165354,
|
||
|
|
"grad_norm": 0.6657812420395008,
|
||
|
|
"learning_rate": 2.5772806119961204e-06,
|
||
|
|
"loss": 0.3273,
|
||
|
|
"mean_token_accuracy": 0.8817225815728307,
|
||
|
|
"num_tokens": 255230273.0,
|
||
|
|
"step": 594
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393798828125,
|
||
|
|
"epoch": 2.34251968503937,
|
||
|
|
"grad_norm": 0.6631893071514698,
|
||
|
|
"learning_rate": 2.5482335244219114e-06,
|
||
|
|
"loss": 0.3415,
|
||
|
|
"mean_token_accuracy": 0.8777058375999331,
|
||
|
|
"num_tokens": 255677101.0,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394866943359375,
|
||
|
|
"epoch": 2.3464566929133857,
|
||
|
|
"grad_norm": 0.6801590026511766,
|
||
|
|
"learning_rate": 2.519327132832592e-06,
|
||
|
|
"loss": 0.3023,
|
||
|
|
"mean_token_accuracy": 0.8904533553868532,
|
||
|
|
"num_tokens": 256095773.0,
|
||
|
|
"step": 596
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40203857421875,
|
||
|
|
"epoch": 2.3503937007874014,
|
||
|
|
"grad_norm": 0.6545771849436208,
|
||
|
|
"learning_rate": 2.4905619830065685e-06,
|
||
|
|
"loss": 0.3276,
|
||
|
|
"mean_token_accuracy": 0.8830998111516237,
|
||
|
|
"num_tokens": 256527168.0,
|
||
|
|
"step": 597
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398193359375,
|
||
|
|
"epoch": 2.354330708661417,
|
||
|
|
"grad_norm": 0.639726060039768,
|
||
|
|
"learning_rate": 2.4619386180554783e-06,
|
||
|
|
"loss": 0.3177,
|
||
|
|
"mean_token_accuracy": 0.8855596333742142,
|
||
|
|
"num_tokens": 256941879.0,
|
||
|
|
"step": 598
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391021728515625,
|
||
|
|
"epoch": 2.358267716535433,
|
||
|
|
"grad_norm": 0.673255368473798,
|
||
|
|
"learning_rate": 2.4334575784139324e-06,
|
||
|
|
"loss": 0.3216,
|
||
|
|
"mean_token_accuracy": 0.8818031437695026,
|
||
|
|
"num_tokens": 257380979.0,
|
||
|
|
"step": 599
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3917236328125,
|
||
|
|
"epoch": 2.362204724409449,
|
||
|
|
"grad_norm": 0.6712526931241498,
|
||
|
|
"learning_rate": 2.405119401829312e-06,
|
||
|
|
"loss": 0.331,
|
||
|
|
"mean_token_accuracy": 0.8820068500936031,
|
||
|
|
"num_tokens": 257816910.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395477294921875,
|
||
|
|
"epoch": 2.366141732283465,
|
||
|
|
"grad_norm": 0.6286157138570574,
|
||
|
|
"learning_rate": 2.3769246233516243e-06,
|
||
|
|
"loss": 0.3185,
|
||
|
|
"mean_token_accuracy": 0.8859440181404352,
|
||
|
|
"num_tokens": 258256027.0,
|
||
|
|
"step": 601
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398040771484375,
|
||
|
|
"epoch": 2.3700787401574805,
|
||
|
|
"grad_norm": 0.6470378030744797,
|
||
|
|
"learning_rate": 2.3488737753233827e-06,
|
||
|
|
"loss": 0.3177,
|
||
|
|
"mean_token_accuracy": 0.883867921307683,
|
||
|
|
"num_tokens": 258673460.0,
|
||
|
|
"step": 602
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39007568359375,
|
||
|
|
"epoch": 2.3740157480314963,
|
||
|
|
"grad_norm": 0.6228299960277243,
|
||
|
|
"learning_rate": 2.3209673873695705e-06,
|
||
|
|
"loss": 0.3264,
|
||
|
|
"mean_token_accuracy": 0.8804994663223624,
|
||
|
|
"num_tokens": 259111848.0,
|
||
|
|
"step": 603
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394622802734375,
|
||
|
|
"epoch": 2.377952755905512,
|
||
|
|
"grad_norm": 0.679880113137842,
|
||
|
|
"learning_rate": 2.2932059863876364e-06,
|
||
|
|
"loss": 0.3189,
|
||
|
|
"mean_token_accuracy": 0.8832629825919867,
|
||
|
|
"num_tokens": 259542914.0,
|
||
|
|
"step": 604
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389068603515625,
|
||
|
|
"epoch": 2.3818897637795278,
|
||
|
|
"grad_norm": 0.6413194426701236,
|
||
|
|
"learning_rate": 2.2655900965375454e-06,
|
||
|
|
"loss": 0.3221,
|
||
|
|
"mean_token_accuracy": 0.8834966970607638,
|
||
|
|
"num_tokens": 259974950.0,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39306640625,
|
||
|
|
"epoch": 2.3858267716535435,
|
||
|
|
"grad_norm": 0.647333297597132,
|
||
|
|
"learning_rate": 2.2381202392318813e-06,
|
||
|
|
"loss": 0.3255,
|
||
|
|
"mean_token_accuracy": 0.8841407634317875,
|
||
|
|
"num_tokens": 260391029.0,
|
||
|
|
"step": 606
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389190673828125,
|
||
|
|
"epoch": 2.3897637795275593,
|
||
|
|
"grad_norm": 0.6237674488982197,
|
||
|
|
"learning_rate": 2.210796933126005e-06,
|
||
|
|
"loss": 0.3207,
|
||
|
|
"mean_token_accuracy": 0.8847579173743725,
|
||
|
|
"num_tokens": 260813537.0,
|
||
|
|
"step": 607
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399444580078125,
|
||
|
|
"epoch": 2.393700787401575,
|
||
|
|
"grad_norm": 0.6641198284295734,
|
||
|
|
"learning_rate": 2.1836206941082593e-06,
|
||
|
|
"loss": 0.3236,
|
||
|
|
"mean_token_accuracy": 0.884418660774827,
|
||
|
|
"num_tokens": 261224513.0,
|
||
|
|
"step": 608
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39337158203125,
|
||
|
|
"epoch": 2.3976377952755907,
|
||
|
|
"grad_norm": 0.6284008254683265,
|
||
|
|
"learning_rate": 2.1565920352902327e-06,
|
||
|
|
"loss": 0.316,
|
||
|
|
"mean_token_accuracy": 0.8880952065810561,
|
||
|
|
"num_tokens": 261640630.0,
|
||
|
|
"step": 609
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391754150390625,
|
||
|
|
"epoch": 2.4015748031496065,
|
||
|
|
"grad_norm": 0.63953172691108,
|
||
|
|
"learning_rate": 2.129711466997062e-06,
|
||
|
|
"loss": 0.3103,
|
||
|
|
"mean_token_accuracy": 0.8891152497380972,
|
||
|
|
"num_tokens": 262069836.0,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39288330078125,
|
||
|
|
"epoch": 2.405511811023622,
|
||
|
|
"grad_norm": 0.64156503400504,
|
||
|
|
"learning_rate": 2.10297949675781e-06,
|
||
|
|
"loss": 0.3337,
|
||
|
|
"mean_token_accuracy": 0.8809280870482326,
|
||
|
|
"num_tokens": 262485062.0,
|
||
|
|
"step": 611
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391937255859375,
|
||
|
|
"epoch": 2.409448818897638,
|
||
|
|
"grad_norm": 0.7503194408517748,
|
||
|
|
"learning_rate": 2.0763966292958704e-06,
|
||
|
|
"loss": 0.3377,
|
||
|
|
"mean_token_accuracy": 0.8801953559741378,
|
||
|
|
"num_tokens": 262914987.0,
|
||
|
|
"step": 612
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39837646484375,
|
||
|
|
"epoch": 2.4133858267716537,
|
||
|
|
"grad_norm": 0.6110565618144053,
|
||
|
|
"learning_rate": 2.049963366519446e-06,
|
||
|
|
"loss": 0.3221,
|
||
|
|
"mean_token_accuracy": 0.8836126467213035,
|
||
|
|
"num_tokens": 263333219.0,
|
||
|
|
"step": 613
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402984619140625,
|
||
|
|
"epoch": 2.4173228346456694,
|
||
|
|
"grad_norm": 0.6475195761761684,
|
||
|
|
"learning_rate": 2.023680207512071e-06,
|
||
|
|
"loss": 0.3216,
|
||
|
|
"mean_token_accuracy": 0.8832600386813283,
|
||
|
|
"num_tokens": 263740746.0,
|
||
|
|
"step": 614
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394622802734375,
|
||
|
|
"epoch": 2.421259842519685,
|
||
|
|
"grad_norm": 0.6209714529513158,
|
||
|
|
"learning_rate": 1.9975476485231847e-06,
|
||
|
|
"loss": 0.3309,
|
||
|
|
"mean_token_accuracy": 0.8813778571784496,
|
||
|
|
"num_tokens": 264164051.0,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395050048828125,
|
||
|
|
"epoch": 2.425196850393701,
|
||
|
|
"grad_norm": 0.6604985399806831,
|
||
|
|
"learning_rate": 1.9715661829587653e-06,
|
||
|
|
"loss": 0.3246,
|
||
|
|
"mean_token_accuracy": 0.8825316475704312,
|
||
|
|
"num_tokens": 264589410.0,
|
||
|
|
"step": 616
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394378662109375,
|
||
|
|
"epoch": 2.4291338582677167,
|
||
|
|
"grad_norm": 0.64135319275765,
|
||
|
|
"learning_rate": 1.94573630137201e-06,
|
||
|
|
"loss": 0.3202,
|
||
|
|
"mean_token_accuracy": 0.8855673084035516,
|
||
|
|
"num_tokens": 265016663.0,
|
||
|
|
"step": 617
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394073486328125,
|
||
|
|
"epoch": 2.4330708661417324,
|
||
|
|
"grad_norm": 0.6549755012645107,
|
||
|
|
"learning_rate": 1.9200584914540833e-06,
|
||
|
|
"loss": 0.3233,
|
||
|
|
"mean_token_accuracy": 0.8843048512935638,
|
||
|
|
"num_tokens": 265448816.0,
|
||
|
|
"step": 618
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39239501953125,
|
||
|
|
"epoch": 2.437007874015748,
|
||
|
|
"grad_norm": 0.6460597097434501,
|
||
|
|
"learning_rate": 1.8945332380248914e-06,
|
||
|
|
"loss": 0.3162,
|
||
|
|
"mean_token_accuracy": 0.8841556925326586,
|
||
|
|
"num_tokens": 265877226.0,
|
||
|
|
"step": 619
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39520263671875,
|
||
|
|
"epoch": 2.440944881889764,
|
||
|
|
"grad_norm": 0.6172518945931613,
|
||
|
|
"learning_rate": 1.8691610230239443e-06,
|
||
|
|
"loss": 0.3289,
|
||
|
|
"mean_token_accuracy": 0.883539610542357,
|
||
|
|
"num_tokens": 266327100.0,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39593505859375,
|
||
|
|
"epoch": 2.4448818897637796,
|
||
|
|
"grad_norm": 0.6661038350327722,
|
||
|
|
"learning_rate": 1.8439423255012478e-06,
|
||
|
|
"loss": 0.3242,
|
||
|
|
"mean_token_accuracy": 0.8828055150806904,
|
||
|
|
"num_tokens": 266753411.0,
|
||
|
|
"step": 621
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391571044921875,
|
||
|
|
"epoch": 2.4488188976377954,
|
||
|
|
"grad_norm": 0.6803435522040893,
|
||
|
|
"learning_rate": 1.8188776216082604e-06,
|
||
|
|
"loss": 0.3167,
|
||
|
|
"mean_token_accuracy": 0.8865473745390773,
|
||
|
|
"num_tokens": 267178072.0,
|
||
|
|
"step": 622
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394989013671875,
|
||
|
|
"epoch": 2.452755905511811,
|
||
|
|
"grad_norm": 0.6808244304054752,
|
||
|
|
"learning_rate": 1.7939673845889072e-06,
|
||
|
|
"loss": 0.3271,
|
||
|
|
"mean_token_accuracy": 0.8820376275107265,
|
||
|
|
"num_tokens": 267599228.0,
|
||
|
|
"step": 623
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39794921875,
|
||
|
|
"epoch": 2.456692913385827,
|
||
|
|
"grad_norm": 0.6497891086437605,
|
||
|
|
"learning_rate": 1.7692120847706396e-06,
|
||
|
|
"loss": 0.3193,
|
||
|
|
"mean_token_accuracy": 0.8842871803790331,
|
||
|
|
"num_tokens": 268031239.0,
|
||
|
|
"step": 624
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395477294921875,
|
||
|
|
"epoch": 2.4606299212598426,
|
||
|
|
"grad_norm": 0.6905683476666384,
|
||
|
|
"learning_rate": 1.7446121895555556e-06,
|
||
|
|
"loss": 0.319,
|
||
|
|
"mean_token_accuracy": 0.8847667053341866,
|
||
|
|
"num_tokens": 268447599.0,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395751953125,
|
||
|
|
"epoch": 2.4645669291338583,
|
||
|
|
"grad_norm": 0.6265658635480155,
|
||
|
|
"learning_rate": 1.7201681634115753e-06,
|
||
|
|
"loss": 0.3156,
|
||
|
|
"mean_token_accuracy": 0.8880419284105301,
|
||
|
|
"num_tokens": 268881017.0,
|
||
|
|
"step": 626
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396240234375,
|
||
|
|
"epoch": 2.468503937007874,
|
||
|
|
"grad_norm": 0.6399557345959777,
|
||
|
|
"learning_rate": 1.6958804678636743e-06,
|
||
|
|
"loss": 0.3179,
|
||
|
|
"mean_token_accuracy": 0.8842396680265665,
|
||
|
|
"num_tokens": 269311901.0,
|
||
|
|
"step": 627
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39349365234375,
|
||
|
|
"epoch": 2.47244094488189,
|
||
|
|
"grad_norm": 0.6593703467640568,
|
||
|
|
"learning_rate": 1.6717495614851654e-06,
|
||
|
|
"loss": 0.3247,
|
||
|
|
"mean_token_accuracy": 0.883893528021872,
|
||
|
|
"num_tokens": 269753510.0,
|
||
|
|
"step": 628
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39080810546875,
|
||
|
|
"epoch": 2.4763779527559056,
|
||
|
|
"grad_norm": 0.6767235781794113,
|
||
|
|
"learning_rate": 1.6477758998890448e-06,
|
||
|
|
"loss": 0.3234,
|
||
|
|
"mean_token_accuracy": 0.8831140054389834,
|
||
|
|
"num_tokens": 270180847.0,
|
||
|
|
"step": 629
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390716552734375,
|
||
|
|
"epoch": 2.4803149606299213,
|
||
|
|
"grad_norm": 0.628556080122311,
|
||
|
|
"learning_rate": 1.6239599357193837e-06,
|
||
|
|
"loss": 0.3222,
|
||
|
|
"mean_token_accuracy": 0.8835263950750232,
|
||
|
|
"num_tokens": 270614343.0,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389892578125,
|
||
|
|
"epoch": 2.484251968503937,
|
||
|
|
"grad_norm": 0.6910689957499726,
|
||
|
|
"learning_rate": 1.6003021186427892e-06,
|
||
|
|
"loss": 0.3109,
|
||
|
|
"mean_token_accuracy": 0.8867247756570578,
|
||
|
|
"num_tokens": 271063084.0,
|
||
|
|
"step": 631
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3984375,
|
||
|
|
"epoch": 2.4881889763779528,
|
||
|
|
"grad_norm": 0.665136959096579,
|
||
|
|
"learning_rate": 1.5768028953399083e-06,
|
||
|
|
"loss": 0.3133,
|
||
|
|
"mean_token_accuracy": 0.8876866241917014,
|
||
|
|
"num_tokens": 271492678.0,
|
||
|
|
"step": 632
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39459228515625,
|
||
|
|
"epoch": 2.4921259842519685,
|
||
|
|
"grad_norm": 0.6786290792116132,
|
||
|
|
"learning_rate": 1.5534627094969957e-06,
|
||
|
|
"loss": 0.3408,
|
||
|
|
"mean_token_accuracy": 0.8770330473780632,
|
||
|
|
"num_tokens": 271927919.0,
|
||
|
|
"step": 633
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40264892578125,
|
||
|
|
"epoch": 2.4960629921259843,
|
||
|
|
"grad_norm": 0.6314184625745893,
|
||
|
|
"learning_rate": 1.5302820017975396e-06,
|
||
|
|
"loss": 0.3266,
|
||
|
|
"mean_token_accuracy": 0.8817340964451432,
|
||
|
|
"num_tokens": 272340839.0,
|
||
|
|
"step": 634
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39813232421875,
|
||
|
|
"epoch": 2.5,
|
||
|
|
"grad_norm": 0.6330687294612445,
|
||
|
|
"learning_rate": 1.5072612099139373e-06,
|
||
|
|
"loss": 0.3285,
|
||
|
|
"mean_token_accuracy": 0.8819912485778332,
|
||
|
|
"num_tokens": 272758612.0,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39630126953125,
|
||
|
|
"epoch": 2.5039370078740157,
|
||
|
|
"grad_norm": 0.6669821941938795,
|
||
|
|
"learning_rate": 1.4844007684992333e-06,
|
||
|
|
"loss": 0.3138,
|
||
|
|
"mean_token_accuracy": 0.886656578630209,
|
||
|
|
"num_tokens": 273181297.0,
|
||
|
|
"step": 636
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393096923828125,
|
||
|
|
"epoch": 2.5078740157480315,
|
||
|
|
"grad_norm": 0.6345157626325129,
|
||
|
|
"learning_rate": 1.4617011091789135e-06,
|
||
|
|
"loss": 0.3153,
|
||
|
|
"mean_token_accuracy": 0.8863168517127633,
|
||
|
|
"num_tokens": 273608125.0,
|
||
|
|
"step": 637
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40460205078125,
|
||
|
|
"epoch": 2.5118110236220472,
|
||
|
|
"grad_norm": 0.6666402919975754,
|
||
|
|
"learning_rate": 1.4391626605427522e-06,
|
||
|
|
"loss": 0.3112,
|
||
|
|
"mean_token_accuracy": 0.8885293649509549,
|
||
|
|
"num_tokens": 274013088.0,
|
||
|
|
"step": 638
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398956298828125,
|
||
|
|
"epoch": 2.515748031496063,
|
||
|
|
"grad_norm": 0.6605459057250379,
|
||
|
|
"learning_rate": 1.4167858481367237e-06,
|
||
|
|
"loss": 0.3177,
|
||
|
|
"mean_token_accuracy": 0.8849203772842884,
|
||
|
|
"num_tokens": 274443071.0,
|
||
|
|
"step": 639
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39727783203125,
|
||
|
|
"epoch": 2.5196850393700787,
|
||
|
|
"grad_norm": 0.6263187778391787,
|
||
|
|
"learning_rate": 1.3945710944549705e-06,
|
||
|
|
"loss": 0.3252,
|
||
|
|
"mean_token_accuracy": 0.8821171736344695,
|
||
|
|
"num_tokens": 274872156.0,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39178466796875,
|
||
|
|
"epoch": 2.5236220472440944,
|
||
|
|
"grad_norm": 0.6356426496587686,
|
||
|
|
"learning_rate": 1.3725188189318172e-06,
|
||
|
|
"loss": 0.3124,
|
||
|
|
"mean_token_accuracy": 0.8871043566614389,
|
||
|
|
"num_tokens": 275329376.0,
|
||
|
|
"step": 641
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397705078125,
|
||
|
|
"epoch": 2.52755905511811,
|
||
|
|
"grad_norm": 0.6682552356019673,
|
||
|
|
"learning_rate": 1.3506294379338557e-06,
|
||
|
|
"loss": 0.3337,
|
||
|
|
"mean_token_accuracy": 0.877638204023242,
|
||
|
|
"num_tokens": 275767280.0,
|
||
|
|
"step": 642
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396026611328125,
|
||
|
|
"epoch": 2.531496062992126,
|
||
|
|
"grad_norm": 0.6390620503683951,
|
||
|
|
"learning_rate": 1.3289033647520878e-06,
|
||
|
|
"loss": 0.3122,
|
||
|
|
"mean_token_accuracy": 0.8879886958748102,
|
||
|
|
"num_tokens": 276195274.0,
|
||
|
|
"step": 643
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398406982421875,
|
||
|
|
"epoch": 2.5354330708661417,
|
||
|
|
"grad_norm": 0.6441985779511971,
|
||
|
|
"learning_rate": 1.307341009594113e-06,
|
||
|
|
"loss": 0.3274,
|
||
|
|
"mean_token_accuracy": 0.8814769377931952,
|
||
|
|
"num_tokens": 276629597.0,
|
||
|
|
"step": 644
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39813232421875,
|
||
|
|
"epoch": 2.5393700787401574,
|
||
|
|
"grad_norm": 0.6314415905953961,
|
||
|
|
"learning_rate": 1.2859427795763967e-06,
|
||
|
|
"loss": 0.3109,
|
||
|
|
"mean_token_accuracy": 0.8865178329870105,
|
||
|
|
"num_tokens": 277046245.0,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39410400390625,
|
||
|
|
"epoch": 2.543307086614173,
|
||
|
|
"grad_norm": 0.6215596209130474,
|
||
|
|
"learning_rate": 1.2647090787165694e-06,
|
||
|
|
"loss": 0.3256,
|
||
|
|
"mean_token_accuracy": 0.8811031272634864,
|
||
|
|
"num_tokens": 277475376.0,
|
||
|
|
"step": 646
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3934326171875,
|
||
|
|
"epoch": 2.547244094488189,
|
||
|
|
"grad_norm": 0.6234194349157417,
|
||
|
|
"learning_rate": 1.2436403079258064e-06,
|
||
|
|
"loss": 0.3094,
|
||
|
|
"mean_token_accuracy": 0.8892135825008154,
|
||
|
|
"num_tokens": 277902729.0,
|
||
|
|
"step": 647
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39385986328125,
|
||
|
|
"epoch": 2.5511811023622046,
|
||
|
|
"grad_norm": 0.7226064894835298,
|
||
|
|
"learning_rate": 1.2227368650012572e-06,
|
||
|
|
"loss": 0.3279,
|
||
|
|
"mean_token_accuracy": 0.8839946733787656,
|
||
|
|
"num_tokens": 278350458.0,
|
||
|
|
"step": 648
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394744873046875,
|
||
|
|
"epoch": 2.5551181102362204,
|
||
|
|
"grad_norm": 0.6298237267921505,
|
||
|
|
"learning_rate": 1.201999144618531e-06,
|
||
|
|
"loss": 0.3196,
|
||
|
|
"mean_token_accuracy": 0.8864294402301311,
|
||
|
|
"num_tokens": 278781796.0,
|
||
|
|
"step": 649
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402099609375,
|
||
|
|
"epoch": 2.559055118110236,
|
||
|
|
"grad_norm": 0.7016612151580089,
|
||
|
|
"learning_rate": 1.1814275383242512e-06,
|
||
|
|
"loss": 0.3317,
|
||
|
|
"mean_token_accuracy": 0.8814294217154384,
|
||
|
|
"num_tokens": 279203350.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396270751953125,
|
||
|
|
"epoch": 2.562992125984252,
|
||
|
|
"grad_norm": 0.6241050703271862,
|
||
|
|
"learning_rate": 1.1610224345286591e-06,
|
||
|
|
"loss": 0.3137,
|
||
|
|
"mean_token_accuracy": 0.8861905531957746,
|
||
|
|
"num_tokens": 279639552.0,
|
||
|
|
"step": 651
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.392730712890625,
|
||
|
|
"epoch": 2.5669291338582676,
|
||
|
|
"grad_norm": 0.659361603190389,
|
||
|
|
"learning_rate": 1.1407842184982786e-06,
|
||
|
|
"loss": 0.3058,
|
||
|
|
"mean_token_accuracy": 0.8890553684905171,
|
||
|
|
"num_tokens": 280082152.0,
|
||
|
|
"step": 652
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393646240234375,
|
||
|
|
"epoch": 2.5708661417322833,
|
||
|
|
"grad_norm": 0.6373452639560493,
|
||
|
|
"learning_rate": 1.1207132723486457e-06,
|
||
|
|
"loss": 0.3193,
|
||
|
|
"mean_token_accuracy": 0.883937232196331,
|
||
|
|
"num_tokens": 280513039.0,
|
||
|
|
"step": 653
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395751953125,
|
||
|
|
"epoch": 2.574803149606299,
|
||
|
|
"grad_norm": 0.6515723371089116,
|
||
|
|
"learning_rate": 1.1008099750370916e-06,
|
||
|
|
"loss": 0.3106,
|
||
|
|
"mean_token_accuracy": 0.8856142768636346,
|
||
|
|
"num_tokens": 280939007.0,
|
||
|
|
"step": 654
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398284912109375,
|
||
|
|
"epoch": 2.578740157480315,
|
||
|
|
"grad_norm": 0.6514931861993788,
|
||
|
|
"learning_rate": 1.0810747023555879e-06,
|
||
|
|
"loss": 0.3011,
|
||
|
|
"mean_token_accuracy": 0.8899843348190188,
|
||
|
|
"num_tokens": 281341467.0,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.401214599609375,
|
||
|
|
"epoch": 2.5826771653543306,
|
||
|
|
"grad_norm": 0.6165563490279274,
|
||
|
|
"learning_rate": 1.0615078269236512e-06,
|
||
|
|
"loss": 0.3268,
|
||
|
|
"mean_token_accuracy": 0.8844519322738051,
|
||
|
|
"num_tokens": 281763144.0,
|
||
|
|
"step": 656
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3929443359375,
|
||
|
|
"epoch": 2.5866141732283463,
|
||
|
|
"grad_norm": 0.8159003176179213,
|
||
|
|
"learning_rate": 1.04210971818131e-06,
|
||
|
|
"loss": 0.3331,
|
||
|
|
"mean_token_accuracy": 0.8811143329367042,
|
||
|
|
"num_tokens": 282216187.0,
|
||
|
|
"step": 657
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38885498046875,
|
||
|
|
"epoch": 2.590551181102362,
|
||
|
|
"grad_norm": 0.6293311703276512,
|
||
|
|
"learning_rate": 1.0228807423821262e-06,
|
||
|
|
"loss": 0.3184,
|
||
|
|
"mean_token_accuracy": 0.8853057865053415,
|
||
|
|
"num_tokens": 282677398.0,
|
||
|
|
"step": 658
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.40081787109375,
|
||
|
|
"epoch": 2.594488188976378,
|
||
|
|
"grad_norm": 0.6521783698950797,
|
||
|
|
"learning_rate": 1.0038212625862799e-06,
|
||
|
|
"loss": 0.3185,
|
||
|
|
"mean_token_accuracy": 0.8854817440733314,
|
||
|
|
"num_tokens": 283091191.0,
|
||
|
|
"step": 659
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3934326171875,
|
||
|
|
"epoch": 2.5984251968503935,
|
||
|
|
"grad_norm": 0.615351062838791,
|
||
|
|
"learning_rate": 9.84931638653719e-07,
|
||
|
|
"loss": 0.3089,
|
||
|
|
"mean_token_accuracy": 0.8892610957846045,
|
||
|
|
"num_tokens": 283521595.0,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39398193359375,
|
||
|
|
"epoch": 2.6023622047244093,
|
||
|
|
"grad_norm": 0.6227949884838964,
|
||
|
|
"learning_rate": 9.662122272373574e-07,
|
||
|
|
"loss": 0.316,
|
||
|
|
"mean_token_accuracy": 0.8863938516005874,
|
||
|
|
"num_tokens": 283954903.0,
|
||
|
|
"step": 661
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39398193359375,
|
||
|
|
"epoch": 2.606299212598425,
|
||
|
|
"grad_norm": 0.6245845638981473,
|
||
|
|
"learning_rate": 9.476633817763481e-07,
|
||
|
|
"loss": 0.3312,
|
||
|
|
"mean_token_accuracy": 0.8837857628241181,
|
||
|
|
"num_tokens": 284389477.0,
|
||
|
|
"step": 662
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4013671875,
|
||
|
|
"epoch": 2.6102362204724407,
|
||
|
|
"grad_norm": 0.6468912877856337,
|
||
|
|
"learning_rate": 9.292854524894068e-07,
|
||
|
|
"loss": 0.3064,
|
||
|
|
"mean_token_accuracy": 0.8889474645256996,
|
||
|
|
"num_tokens": 284806724.0,
|
||
|
|
"step": 663
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398712158203125,
|
||
|
|
"epoch": 2.6141732283464565,
|
||
|
|
"grad_norm": 0.6124857634152812,
|
||
|
|
"learning_rate": 9.110787863682002e-07,
|
||
|
|
"loss": 0.3194,
|
||
|
|
"mean_token_accuracy": 0.8831356568261981,
|
||
|
|
"num_tokens": 285235987.0,
|
||
|
|
"step": 664
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393951416015625,
|
||
|
|
"epoch": 2.6181102362204722,
|
||
|
|
"grad_norm": 0.6135551165191099,
|
||
|
|
"learning_rate": 8.930437271707915e-07,
|
||
|
|
"loss": 0.3071,
|
||
|
|
"mean_token_accuracy": 0.8895376035943627,
|
||
|
|
"num_tokens": 285668668.0,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39849853515625,
|
||
|
|
"epoch": 2.622047244094488,
|
||
|
|
"grad_norm": 0.6279955512566158,
|
||
|
|
"learning_rate": 8.751806154151521e-07,
|
||
|
|
"loss": 0.3096,
|
||
|
|
"mean_token_accuracy": 0.8884470723569393,
|
||
|
|
"num_tokens": 286103078.0,
|
||
|
|
"step": 666
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394317626953125,
|
||
|
|
"epoch": 2.6259842519685037,
|
||
|
|
"grad_norm": 0.738468612337961,
|
||
|
|
"learning_rate": 8.574897883727384e-07,
|
||
|
|
"loss": 0.3257,
|
||
|
|
"mean_token_accuracy": 0.8832857329398394,
|
||
|
|
"num_tokens": 286553130.0,
|
||
|
|
"step": 667
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.392608642578125,
|
||
|
|
"epoch": 2.6299212598425195,
|
||
|
|
"grad_norm": 1.2334069200957607,
|
||
|
|
"learning_rate": 8.399715800621111e-07,
|
||
|
|
"loss": 0.3111,
|
||
|
|
"mean_token_accuracy": 0.8888679994270205,
|
||
|
|
"num_tokens": 286979768.0,
|
||
|
|
"step": 668
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.386383056640625,
|
||
|
|
"epoch": 2.633858267716535,
|
||
|
|
"grad_norm": 0.6221501280148931,
|
||
|
|
"learning_rate": 8.226263212426389e-07,
|
||
|
|
"loss": 0.315,
|
||
|
|
"mean_token_accuracy": 0.8863368751481175,
|
||
|
|
"num_tokens": 287440144.0,
|
||
|
|
"step": 669
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396026611328125,
|
||
|
|
"epoch": 2.637795275590551,
|
||
|
|
"grad_norm": 0.6190885324530173,
|
||
|
|
"learning_rate": 8.054543394082503e-07,
|
||
|
|
"loss": 0.3277,
|
||
|
|
"mean_token_accuracy": 0.8823181875050068,
|
||
|
|
"num_tokens": 287886492.0,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393524169921875,
|
||
|
|
"epoch": 2.6417322834645667,
|
||
|
|
"grad_norm": 0.7064829350170301,
|
||
|
|
"learning_rate": 7.884559587812501e-07,
|
||
|
|
"loss": 0.317,
|
||
|
|
"mean_token_accuracy": 0.8856177758425474,
|
||
|
|
"num_tokens": 288320175.0,
|
||
|
|
"step": 671
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399383544921875,
|
||
|
|
"epoch": 2.6456692913385824,
|
||
|
|
"grad_norm": 0.7329568438915423,
|
||
|
|
"learning_rate": 7.716315003061948e-07,
|
||
|
|
"loss": 0.3193,
|
||
|
|
"mean_token_accuracy": 0.8855800237506628,
|
||
|
|
"num_tokens": 288747403.0,
|
||
|
|
"step": 672
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39471435546875,
|
||
|
|
"epoch": 2.649606299212598,
|
||
|
|
"grad_norm": 0.61185048591417,
|
||
|
|
"learning_rate": 7.549812816438395e-07,
|
||
|
|
"loss": 0.3117,
|
||
|
|
"mean_token_accuracy": 0.8875846909359097,
|
||
|
|
"num_tokens": 289190749.0,
|
||
|
|
"step": 673
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3912353515625,
|
||
|
|
"epoch": 2.653543307086614,
|
||
|
|
"grad_norm": 0.6087484451984008,
|
||
|
|
"learning_rate": 7.38505617165135e-07,
|
||
|
|
"loss": 0.315,
|
||
|
|
"mean_token_accuracy": 0.88607323076576,
|
||
|
|
"num_tokens": 289624045.0,
|
||
|
|
"step": 674
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.387939453125,
|
||
|
|
"epoch": 2.65748031496063,
|
||
|
|
"grad_norm": 0.6318153118085255,
|
||
|
|
"learning_rate": 7.222048179452945e-07,
|
||
|
|
"loss": 0.3175,
|
||
|
|
"mean_token_accuracy": 0.8839446315541863,
|
||
|
|
"num_tokens": 290070600.0,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390716552734375,
|
||
|
|
"epoch": 2.661417322834646,
|
||
|
|
"grad_norm": 0.6310528037803923,
|
||
|
|
"learning_rate": 7.06079191757918e-07,
|
||
|
|
"loss": 0.316,
|
||
|
|
"mean_token_accuracy": 0.8856059042736888,
|
||
|
|
"num_tokens": 290513687.0,
|
||
|
|
"step": 676
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395843505859375,
|
||
|
|
"epoch": 2.6653543307086616,
|
||
|
|
"grad_norm": 0.6093719666352239,
|
||
|
|
"learning_rate": 6.901290430691842e-07,
|
||
|
|
"loss": 0.3252,
|
||
|
|
"mean_token_accuracy": 0.8829275881871581,
|
||
|
|
"num_tokens": 290928435.0,
|
||
|
|
"step": 677
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3968505859375,
|
||
|
|
"epoch": 2.6692913385826773,
|
||
|
|
"grad_norm": 0.6849993862489645,
|
||
|
|
"learning_rate": 6.743546730320993e-07,
|
||
|
|
"loss": 0.3281,
|
||
|
|
"mean_token_accuracy": 0.8848955575376749,
|
||
|
|
"num_tokens": 291364474.0,
|
||
|
|
"step": 678
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389373779296875,
|
||
|
|
"epoch": 2.673228346456693,
|
||
|
|
"grad_norm": 0.6129024908856187,
|
||
|
|
"learning_rate": 6.587563794808127e-07,
|
||
|
|
"loss": 0.3157,
|
||
|
|
"mean_token_accuracy": 0.8874608399346471,
|
||
|
|
"num_tokens": 291802575.0,
|
||
|
|
"step": 679
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39959716796875,
|
||
|
|
"epoch": 2.677165354330709,
|
||
|
|
"grad_norm": 0.6279482452089803,
|
||
|
|
"learning_rate": 6.433344569249922e-07,
|
||
|
|
"loss": 0.312,
|
||
|
|
"mean_token_accuracy": 0.8877925118431449,
|
||
|
|
"num_tokens": 292217867.0,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394195556640625,
|
||
|
|
"epoch": 2.6811023622047245,
|
||
|
|
"grad_norm": 0.618547629928698,
|
||
|
|
"learning_rate": 6.280891965442648e-07,
|
||
|
|
"loss": 0.3039,
|
||
|
|
"mean_token_accuracy": 0.8895618692040443,
|
||
|
|
"num_tokens": 292638162.0,
|
||
|
|
"step": 681
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.392333984375,
|
||
|
|
"epoch": 2.6850393700787403,
|
||
|
|
"grad_norm": 0.609097120858831,
|
||
|
|
"learning_rate": 6.130208861827203e-07,
|
||
|
|
"loss": 0.3213,
|
||
|
|
"mean_token_accuracy": 0.8848212473094463,
|
||
|
|
"num_tokens": 293080208.0,
|
||
|
|
"step": 682
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3900146484375,
|
||
|
|
"epoch": 2.688976377952756,
|
||
|
|
"grad_norm": 0.6103219460414863,
|
||
|
|
"learning_rate": 5.981298103434696e-07,
|
||
|
|
"loss": 0.314,
|
||
|
|
"mean_token_accuracy": 0.885158559307456,
|
||
|
|
"num_tokens": 293497642.0,
|
||
|
|
"step": 683
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394195556640625,
|
||
|
|
"epoch": 2.6929133858267718,
|
||
|
|
"grad_norm": 0.6218219688195682,
|
||
|
|
"learning_rate": 5.83416250183283e-07,
|
||
|
|
"loss": 0.3215,
|
||
|
|
"mean_token_accuracy": 0.8855108115822077,
|
||
|
|
"num_tokens": 293933914.0,
|
||
|
|
"step": 684
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39300537109375,
|
||
|
|
"epoch": 2.6968503937007875,
|
||
|
|
"grad_norm": 0.633185380507804,
|
||
|
|
"learning_rate": 5.688804835072748e-07,
|
||
|
|
"loss": 0.3008,
|
||
|
|
"mean_token_accuracy": 0.8915995480492711,
|
||
|
|
"num_tokens": 294354925.0,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39642333984375,
|
||
|
|
"epoch": 2.7007874015748032,
|
||
|
|
"grad_norm": 0.6221615782319272,
|
||
|
|
"learning_rate": 5.545227847636602e-07,
|
||
|
|
"loss": 0.3257,
|
||
|
|
"mean_token_accuracy": 0.8829503497108817,
|
||
|
|
"num_tokens": 294776144.0,
|
||
|
|
"step": 686
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.387420654296875,
|
||
|
|
"epoch": 2.704724409448819,
|
||
|
|
"grad_norm": 0.7005879338266516,
|
||
|
|
"learning_rate": 5.40343425038573e-07,
|
||
|
|
"loss": 0.3199,
|
||
|
|
"mean_token_accuracy": 0.886635722592473,
|
||
|
|
"num_tokens": 295217208.0,
|
||
|
|
"step": 687
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39031982421875,
|
||
|
|
"epoch": 2.7086614173228347,
|
||
|
|
"grad_norm": 0.5926582054950879,
|
||
|
|
"learning_rate": 5.263426720509469e-07,
|
||
|
|
"loss": 0.3178,
|
||
|
|
"mean_token_accuracy": 0.8865943877026439,
|
||
|
|
"num_tokens": 295677828.0,
|
||
|
|
"step": 688
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3997802734375,
|
||
|
|
"epoch": 2.7125984251968505,
|
||
|
|
"grad_norm": 0.6117770278180463,
|
||
|
|
"learning_rate": 5.125207901474638e-07,
|
||
|
|
"loss": 0.3104,
|
||
|
|
"mean_token_accuracy": 0.8866582782939076,
|
||
|
|
"num_tokens": 296105909.0,
|
||
|
|
"step": 689
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.392242431640625,
|
||
|
|
"epoch": 2.716535433070866,
|
||
|
|
"grad_norm": 0.6130135566792134,
|
||
|
|
"learning_rate": 4.98878040297559e-07,
|
||
|
|
"loss": 0.3213,
|
||
|
|
"mean_token_accuracy": 0.883451035246253,
|
||
|
|
"num_tokens": 296530077.0,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39068603515625,
|
||
|
|
"epoch": 2.720472440944882,
|
||
|
|
"grad_norm": 0.6247127760703306,
|
||
|
|
"learning_rate": 4.854146800884929e-07,
|
||
|
|
"loss": 0.3219,
|
||
|
|
"mean_token_accuracy": 0.8842593487352133,
|
||
|
|
"num_tokens": 296965385.0,
|
||
|
|
"step": 691
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4078369140625,
|
||
|
|
"epoch": 2.7244094488188977,
|
||
|
|
"grad_norm": 0.6192979804386426,
|
||
|
|
"learning_rate": 4.7213096372049404e-07,
|
||
|
|
"loss": 0.3173,
|
||
|
|
"mean_token_accuracy": 0.8843513103201985,
|
||
|
|
"num_tokens": 297381008.0,
|
||
|
|
"step": 692
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402801513671875,
|
||
|
|
"epoch": 2.7283464566929134,
|
||
|
|
"grad_norm": 0.6493879750559709,
|
||
|
|
"learning_rate": 4.59027142001951e-07,
|
||
|
|
"loss": 0.2997,
|
||
|
|
"mean_token_accuracy": 0.8891726117581129,
|
||
|
|
"num_tokens": 297761093.0,
|
||
|
|
"step": 693
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3902587890625,
|
||
|
|
"epoch": 2.732283464566929,
|
||
|
|
"grad_norm": 0.5908426802287144,
|
||
|
|
"learning_rate": 4.461034623446847e-07,
|
||
|
|
"loss": 0.3104,
|
||
|
|
"mean_token_accuracy": 0.8874157816171646,
|
||
|
|
"num_tokens": 298211265.0,
|
||
|
|
"step": 694
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.398223876953125,
|
||
|
|
"epoch": 2.736220472440945,
|
||
|
|
"grad_norm": 0.6216414668892356,
|
||
|
|
"learning_rate": 4.333601687592714e-07,
|
||
|
|
"loss": 0.3073,
|
||
|
|
"mean_token_accuracy": 0.8897824250161648,
|
||
|
|
"num_tokens": 298633043.0,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391265869140625,
|
||
|
|
"epoch": 2.7401574803149606,
|
||
|
|
"grad_norm": 0.6174935182318937,
|
||
|
|
"learning_rate": 4.2079750185043955e-07,
|
||
|
|
"loss": 0.3194,
|
||
|
|
"mean_token_accuracy": 0.8854984659701586,
|
||
|
|
"num_tokens": 299078818.0,
|
||
|
|
"step": 696
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39642333984375,
|
||
|
|
"epoch": 2.7440944881889764,
|
||
|
|
"grad_norm": 0.617257000557681,
|
||
|
|
"learning_rate": 4.084156988125232e-07,
|
||
|
|
"loss": 0.3398,
|
||
|
|
"mean_token_accuracy": 0.8800790719687939,
|
||
|
|
"num_tokens": 299506515.0,
|
||
|
|
"step": 697
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39508056640625,
|
||
|
|
"epoch": 2.748031496062992,
|
||
|
|
"grad_norm": 0.6355414363515316,
|
||
|
|
"learning_rate": 3.9621499342498706e-07,
|
||
|
|
"loss": 0.2952,
|
||
|
|
"mean_token_accuracy": 0.8933856235817075,
|
||
|
|
"num_tokens": 299906938.0,
|
||
|
|
"step": 698
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.402252197265625,
|
||
|
|
"epoch": 2.751968503937008,
|
||
|
|
"grad_norm": 0.6480768361433473,
|
||
|
|
"learning_rate": 3.841956160480098e-07,
|
||
|
|
"loss": 0.3253,
|
||
|
|
"mean_token_accuracy": 0.8832006398588419,
|
||
|
|
"num_tokens": 300309118.0,
|
||
|
|
"step": 699
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3931884765625,
|
||
|
|
"epoch": 2.7559055118110236,
|
||
|
|
"grad_norm": 0.6200677492498358,
|
||
|
|
"learning_rate": 3.723577936181366e-07,
|
||
|
|
"loss": 0.3143,
|
||
|
|
"mean_token_accuracy": 0.8876901566982269,
|
||
|
|
"num_tokens": 300730067.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39215087890625,
|
||
|
|
"epoch": 2.7598425196850394,
|
||
|
|
"grad_norm": 0.6282804561575106,
|
||
|
|
"learning_rate": 3.607017496439935e-07,
|
||
|
|
"loss": 0.3063,
|
||
|
|
"mean_token_accuracy": 0.8888409864157438,
|
||
|
|
"num_tokens": 301152430.0,
|
||
|
|
"step": 701
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39166259765625,
|
||
|
|
"epoch": 2.763779527559055,
|
||
|
|
"grad_norm": 0.6167008846266515,
|
||
|
|
"learning_rate": 3.4922770420206754e-07,
|
||
|
|
"loss": 0.3138,
|
||
|
|
"mean_token_accuracy": 0.887148299254477,
|
||
|
|
"num_tokens": 301597234.0,
|
||
|
|
"step": 702
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.387939453125,
|
||
|
|
"epoch": 2.767716535433071,
|
||
|
|
"grad_norm": 0.6274220462057958,
|
||
|
|
"learning_rate": 3.3793587393255e-07,
|
||
|
|
"loss": 0.3082,
|
||
|
|
"mean_token_accuracy": 0.889225204475224,
|
||
|
|
"num_tokens": 302034846.0,
|
||
|
|
"step": 703
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397613525390625,
|
||
|
|
"epoch": 2.7716535433070866,
|
||
|
|
"grad_norm": 0.6217284823956639,
|
||
|
|
"learning_rate": 3.2682647203525095e-07,
|
||
|
|
"loss": 0.3091,
|
||
|
|
"mean_token_accuracy": 0.8873699698597193,
|
||
|
|
"num_tokens": 302468902.0,
|
||
|
|
"step": 704
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.391632080078125,
|
||
|
|
"epoch": 2.7755905511811023,
|
||
|
|
"grad_norm": 0.6012726111153172,
|
||
|
|
"learning_rate": 3.158997082655668e-07,
|
||
|
|
"loss": 0.3058,
|
||
|
|
"mean_token_accuracy": 0.8894424652680755,
|
||
|
|
"num_tokens": 302896329.0,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389434814453125,
|
||
|
|
"epoch": 2.779527559055118,
|
||
|
|
"grad_norm": 0.6234688848554906,
|
||
|
|
"learning_rate": 3.0515578893052343e-07,
|
||
|
|
"loss": 0.318,
|
||
|
|
"mean_token_accuracy": 0.8851411901414394,
|
||
|
|
"num_tokens": 303323908.0,
|
||
|
|
"step": 706
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397430419921875,
|
||
|
|
"epoch": 2.783464566929134,
|
||
|
|
"grad_norm": 0.6061624445272218,
|
||
|
|
"learning_rate": 2.9459491688488604e-07,
|
||
|
|
"loss": 0.3036,
|
||
|
|
"mean_token_accuracy": 0.888351739384234,
|
||
|
|
"num_tokens": 303745436.0,
|
||
|
|
"step": 707
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.399261474609375,
|
||
|
|
"epoch": 2.7874015748031495,
|
||
|
|
"grad_norm": 0.7479126909396716,
|
||
|
|
"learning_rate": 2.8421729152731783e-07,
|
||
|
|
"loss": 0.3268,
|
||
|
|
"mean_token_accuracy": 0.883736445568502,
|
||
|
|
"num_tokens": 304165087.0,
|
||
|
|
"step": 708
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390350341796875,
|
||
|
|
"epoch": 2.7913385826771653,
|
||
|
|
"grad_norm": 0.6211293030549162,
|
||
|
|
"learning_rate": 2.7402310879662497e-07,
|
||
|
|
"loss": 0.3108,
|
||
|
|
"mean_token_accuracy": 0.8857911806553602,
|
||
|
|
"num_tokens": 304597934.0,
|
||
|
|
"step": 709
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395965576171875,
|
||
|
|
"epoch": 2.795275590551181,
|
||
|
|
"grad_norm": 0.6111833255354941,
|
||
|
|
"learning_rate": 2.640125611680522e-07,
|
||
|
|
"loss": 0.307,
|
||
|
|
"mean_token_accuracy": 0.8857949497178197,
|
||
|
|
"num_tokens": 305018870.0,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.394439697265625,
|
||
|
|
"epoch": 2.7992125984251968,
|
||
|
|
"grad_norm": 0.6245889179074243,
|
||
|
|
"learning_rate": 2.54185837649652e-07,
|
||
|
|
"loss": 0.3146,
|
||
|
|
"mean_token_accuracy": 0.8863369012251496,
|
||
|
|
"num_tokens": 305445367.0,
|
||
|
|
"step": 711
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38995361328125,
|
||
|
|
"epoch": 2.8031496062992125,
|
||
|
|
"grad_norm": 0.6361336623622774,
|
||
|
|
"learning_rate": 2.4454312377871105e-07,
|
||
|
|
"loss": 0.3054,
|
||
|
|
"mean_token_accuracy": 0.8899529185146093,
|
||
|
|
"num_tokens": 305863242.0,
|
||
|
|
"step": 712
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393829345703125,
|
||
|
|
"epoch": 2.8070866141732282,
|
||
|
|
"grad_norm": 0.628109002053711,
|
||
|
|
"learning_rate": 2.3508460161825176e-07,
|
||
|
|
"loss": 0.3193,
|
||
|
|
"mean_token_accuracy": 0.8849193248897791,
|
||
|
|
"num_tokens": 306299748.0,
|
||
|
|
"step": 713
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393280029296875,
|
||
|
|
"epoch": 2.811023622047244,
|
||
|
|
"grad_norm": 0.6279036033740151,
|
||
|
|
"learning_rate": 2.25810449753594e-07,
|
||
|
|
"loss": 0.314,
|
||
|
|
"mean_token_accuracy": 0.8862151158973575,
|
||
|
|
"num_tokens": 306721284.0,
|
||
|
|
"step": 714
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3931884765625,
|
||
|
|
"epoch": 2.8149606299212597,
|
||
|
|
"grad_norm": 0.6192907683317119,
|
||
|
|
"learning_rate": 2.167208432889789e-07,
|
||
|
|
"loss": 0.3112,
|
||
|
|
"mean_token_accuracy": 0.886825337074697,
|
||
|
|
"num_tokens": 307148450.0,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3929443359375,
|
||
|
|
"epoch": 2.8188976377952755,
|
||
|
|
"grad_norm": 0.6423049938131764,
|
||
|
|
"learning_rate": 2.0781595384427032e-07,
|
||
|
|
"loss": 0.3213,
|
||
|
|
"mean_token_accuracy": 0.8844384793192148,
|
||
|
|
"num_tokens": 307575103.0,
|
||
|
|
"step": 716
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.388519287109375,
|
||
|
|
"epoch": 2.822834645669291,
|
||
|
|
"grad_norm": 0.6895754865141671,
|
||
|
|
"learning_rate": 1.9909594955170752e-07,
|
||
|
|
"loss": 0.3166,
|
||
|
|
"mean_token_accuracy": 0.8848179634660482,
|
||
|
|
"num_tokens": 308030230.0,
|
||
|
|
"step": 717
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39794921875,
|
||
|
|
"epoch": 2.826771653543307,
|
||
|
|
"grad_norm": 0.6307804441913263,
|
||
|
|
"learning_rate": 1.9056099505273428e-07,
|
||
|
|
"loss": 0.3054,
|
||
|
|
"mean_token_accuracy": 0.8902622666209936,
|
||
|
|
"num_tokens": 308450663.0,
|
||
|
|
"step": 718
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389678955078125,
|
||
|
|
"epoch": 2.8307086614173227,
|
||
|
|
"grad_norm": 0.6146486060327043,
|
||
|
|
"learning_rate": 1.8221125149489038e-07,
|
||
|
|
"loss": 0.3064,
|
||
|
|
"mean_token_accuracy": 0.8884834293276072,
|
||
|
|
"num_tokens": 308894935.0,
|
||
|
|
"step": 719
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390594482421875,
|
||
|
|
"epoch": 2.8346456692913384,
|
||
|
|
"grad_norm": 0.6205281415006133,
|
||
|
|
"learning_rate": 1.7404687652876728e-07,
|
||
|
|
"loss": 0.3069,
|
||
|
|
"mean_token_accuracy": 0.8885259497910738,
|
||
|
|
"num_tokens": 309328947.0,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38934326171875,
|
||
|
|
"epoch": 2.838582677165354,
|
||
|
|
"grad_norm": 0.6507938234415173,
|
||
|
|
"learning_rate": 1.6606802430503166e-07,
|
||
|
|
"loss": 0.3189,
|
||
|
|
"mean_token_accuracy": 0.8856641910970211,
|
||
|
|
"num_tokens": 309766264.0,
|
||
|
|
"step": 721
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390716552734375,
|
||
|
|
"epoch": 2.84251968503937,
|
||
|
|
"grad_norm": 0.6272447591973911,
|
||
|
|
"learning_rate": 1.5827484547151772e-07,
|
||
|
|
"loss": 0.3177,
|
||
|
|
"mean_token_accuracy": 0.8859880482777953,
|
||
|
|
"num_tokens": 310197799.0,
|
||
|
|
"step": 722
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390716552734375,
|
||
|
|
"epoch": 2.846456692913386,
|
||
|
|
"grad_norm": 0.6513111313353903,
|
||
|
|
"learning_rate": 1.506674871703795e-07,
|
||
|
|
"loss": 0.3109,
|
||
|
|
"mean_token_accuracy": 0.8863828666508198,
|
||
|
|
"num_tokens": 310620770.0,
|
||
|
|
"step": 723
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393646240234375,
|
||
|
|
"epoch": 2.850393700787402,
|
||
|
|
"grad_norm": 0.6625745601868105,
|
||
|
|
"learning_rate": 1.43246093035313e-07,
|
||
|
|
"loss": 0.3184,
|
||
|
|
"mean_token_accuracy": 0.8855500891804695,
|
||
|
|
"num_tokens": 311036524.0,
|
||
|
|
"step": 724
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396697998046875,
|
||
|
|
"epoch": 2.8543307086614176,
|
||
|
|
"grad_norm": 0.6566020287580173,
|
||
|
|
"learning_rate": 1.360108031888474e-07,
|
||
|
|
"loss": 0.3299,
|
||
|
|
"mean_token_accuracy": 0.8796233041211963,
|
||
|
|
"num_tokens": 311469275.0,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3905029296875,
|
||
|
|
"epoch": 2.8582677165354333,
|
||
|
|
"grad_norm": 0.6006475257455051,
|
||
|
|
"learning_rate": 1.2896175423969592e-07,
|
||
|
|
"loss": 0.3115,
|
||
|
|
"mean_token_accuracy": 0.8865141673013568,
|
||
|
|
"num_tokens": 311898980.0,
|
||
|
|
"step": 726
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39263916015625,
|
||
|
|
"epoch": 2.862204724409449,
|
||
|
|
"grad_norm": 0.6191607551639687,
|
||
|
|
"learning_rate": 1.2209907928017794e-07,
|
||
|
|
"loss": 0.3289,
|
||
|
|
"mean_token_accuracy": 0.8829834200441837,
|
||
|
|
"num_tokens": 312348478.0,
|
||
|
|
"step": 727
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38818359375,
|
||
|
|
"epoch": 2.866141732283465,
|
||
|
|
"grad_norm": 0.6791667510376468,
|
||
|
|
"learning_rate": 1.1542290788370547e-07,
|
||
|
|
"loss": 0.327,
|
||
|
|
"mean_token_accuracy": 0.8845472950488329,
|
||
|
|
"num_tokens": 312779963.0,
|
||
|
|
"step": 728
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39227294921875,
|
||
|
|
"epoch": 2.8700787401574805,
|
||
|
|
"grad_norm": 0.6297603203621586,
|
||
|
|
"learning_rate": 1.089333661023373e-07,
|
||
|
|
"loss": 0.3175,
|
||
|
|
"mean_token_accuracy": 0.8854846712201834,
|
||
|
|
"num_tokens": 313230002.0,
|
||
|
|
"step": 729
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393951416015625,
|
||
|
|
"epoch": 2.8740157480314963,
|
||
|
|
"grad_norm": 0.6307950928733317,
|
||
|
|
"learning_rate": 1.02630576464402e-07,
|
||
|
|
"loss": 0.3097,
|
||
|
|
"mean_token_accuracy": 0.8888869564980268,
|
||
|
|
"num_tokens": 313655734.0,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.4013671875,
|
||
|
|
"epoch": 2.877952755905512,
|
||
|
|
"grad_norm": 0.6300374206111313,
|
||
|
|
"learning_rate": 9.651465797217652e-08,
|
||
|
|
"loss": 0.3016,
|
||
|
|
"mean_token_accuracy": 0.8900213818997145,
|
||
|
|
"num_tokens": 314053306.0,
|
||
|
|
"step": 731
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.397491455078125,
|
||
|
|
"epoch": 2.8818897637795278,
|
||
|
|
"grad_norm": 0.6472776766881085,
|
||
|
|
"learning_rate": 9.058572609964788e-08,
|
||
|
|
"loss": 0.3188,
|
||
|
|
"mean_token_accuracy": 0.8872040212154388,
|
||
|
|
"num_tokens": 314496320.0,
|
||
|
|
"step": 732
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39404296875,
|
||
|
|
"epoch": 2.8858267716535435,
|
||
|
|
"grad_norm": 0.6313115545178322,
|
||
|
|
"learning_rate": 8.484389279032835e-08,
|
||
|
|
"loss": 0.3238,
|
||
|
|
"mean_token_accuracy": 0.88473956938833,
|
||
|
|
"num_tokens": 314930406.0,
|
||
|
|
"step": 733
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.388824462890625,
|
||
|
|
"epoch": 2.8897637795275593,
|
||
|
|
"grad_norm": 0.6382768601008938,
|
||
|
|
"learning_rate": 7.928926645514034e-08,
|
||
|
|
"loss": 0.3052,
|
||
|
|
"mean_token_accuracy": 0.8889888888224959,
|
||
|
|
"num_tokens": 315367644.0,
|
||
|
|
"step": 734
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3909912109375,
|
||
|
|
"epoch": 2.893700787401575,
|
||
|
|
"grad_norm": 0.6284317329218896,
|
||
|
|
"learning_rate": 7.39219519703771e-08,
|
||
|
|
"loss": 0.3191,
|
||
|
|
"mean_token_accuracy": 0.8852613195776939,
|
||
|
|
"num_tokens": 315807014.0,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39532470703125,
|
||
|
|
"epoch": 2.8976377952755907,
|
||
|
|
"grad_norm": 0.6357283678636514,
|
||
|
|
"learning_rate": 6.874205067571082e-08,
|
||
|
|
"loss": 0.3036,
|
||
|
|
"mean_token_accuracy": 0.8906446024775505,
|
||
|
|
"num_tokens": 316233269.0,
|
||
|
|
"step": 736
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389434814453125,
|
||
|
|
"epoch": 2.9015748031496065,
|
||
|
|
"grad_norm": 0.6074500598261228,
|
||
|
|
"learning_rate": 6.374966037229202e-08,
|
||
|
|
"loss": 0.3162,
|
||
|
|
"mean_token_accuracy": 0.8877827478572726,
|
||
|
|
"num_tokens": 316683927.0,
|
||
|
|
"step": 737
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39202880859375,
|
||
|
|
"epoch": 2.905511811023622,
|
||
|
|
"grad_norm": 0.6689918803322398,
|
||
|
|
"learning_rate": 5.894487532089321e-08,
|
||
|
|
"loss": 0.3108,
|
||
|
|
"mean_token_accuracy": 0.8866195920854807,
|
||
|
|
"num_tokens": 317113161.0,
|
||
|
|
"step": 738
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396728515625,
|
||
|
|
"epoch": 2.909448818897638,
|
||
|
|
"grad_norm": 0.6665066258545567,
|
||
|
|
"learning_rate": 5.4327786240132576e-08,
|
||
|
|
"loss": 0.3087,
|
||
|
|
"mean_token_accuracy": 0.8856972297653556,
|
||
|
|
"num_tokens": 317534548.0,
|
||
|
|
"step": 739
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3934326171875,
|
||
|
|
"epoch": 2.9133858267716537,
|
||
|
|
"grad_norm": 0.6254976882222406,
|
||
|
|
"learning_rate": 4.989848030476307e-08,
|
||
|
|
"loss": 0.3152,
|
||
|
|
"mean_token_accuracy": 0.888129792176187,
|
||
|
|
"num_tokens": 317967477.0,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39166259765625,
|
||
|
|
"epoch": 2.9173228346456694,
|
||
|
|
"grad_norm": 0.6188976112537805,
|
||
|
|
"learning_rate": 4.5657041144023804e-08,
|
||
|
|
"loss": 0.3131,
|
||
|
|
"mean_token_accuracy": 0.8876488180831075,
|
||
|
|
"num_tokens": 318401623.0,
|
||
|
|
"step": 741
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3941650390625,
|
||
|
|
"epoch": 2.921259842519685,
|
||
|
|
"grad_norm": 0.6224520183112306,
|
||
|
|
"learning_rate": 4.1603548840062344e-08,
|
||
|
|
"loss": 0.3144,
|
||
|
|
"mean_token_accuracy": 0.8860314814373851,
|
||
|
|
"num_tokens": 318829485.0,
|
||
|
|
"step": 742
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389404296875,
|
||
|
|
"epoch": 2.925196850393701,
|
||
|
|
"grad_norm": 0.6621931157658465,
|
||
|
|
"learning_rate": 3.773807992642153e-08,
|
||
|
|
"loss": 0.3119,
|
||
|
|
"mean_token_accuracy": 0.8872817764058709,
|
||
|
|
"num_tokens": 319262158.0,
|
||
|
|
"step": 743
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3955078125,
|
||
|
|
"epoch": 2.9291338582677167,
|
||
|
|
"grad_norm": 0.6448239265983623,
|
||
|
|
"learning_rate": 3.406070738659617e-08,
|
||
|
|
"loss": 0.3119,
|
||
|
|
"mean_token_accuracy": 0.8864805707708001,
|
||
|
|
"num_tokens": 319687504.0,
|
||
|
|
"step": 744
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.388275146484375,
|
||
|
|
"epoch": 2.9330708661417324,
|
||
|
|
"grad_norm": 0.6124809281758588,
|
||
|
|
"learning_rate": 3.0571500652651906e-08,
|
||
|
|
"loss": 0.3146,
|
||
|
|
"mean_token_accuracy": 0.8860159404575825,
|
||
|
|
"num_tokens": 320119340.0,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.390960693359375,
|
||
|
|
"epoch": 2.937007874015748,
|
||
|
|
"grad_norm": 0.6988960170224446,
|
||
|
|
"learning_rate": 2.7270525603920738e-08,
|
||
|
|
"loss": 0.3181,
|
||
|
|
"mean_token_accuracy": 0.8859586268663406,
|
||
|
|
"num_tokens": 320552288.0,
|
||
|
|
"step": 746
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.395050048828125,
|
||
|
|
"epoch": 2.940944881889764,
|
||
|
|
"grad_norm": 1.1277672104662997,
|
||
|
|
"learning_rate": 2.4157844565747546e-08,
|
||
|
|
"loss": 0.3105,
|
||
|
|
"mean_token_accuracy": 0.8864605631679296,
|
||
|
|
"num_tokens": 320970311.0,
|
||
|
|
"step": 747
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393951416015625,
|
||
|
|
"epoch": 2.9448818897637796,
|
||
|
|
"grad_norm": 0.606549790004319,
|
||
|
|
"learning_rate": 2.1233516308323266e-08,
|
||
|
|
"loss": 0.3196,
|
||
|
|
"mean_token_accuracy": 0.8869588067755103,
|
||
|
|
"num_tokens": 321394036.0,
|
||
|
|
"step": 748
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38861083984375,
|
||
|
|
"epoch": 2.9488188976377954,
|
||
|
|
"grad_norm": 0.610184978078755,
|
||
|
|
"learning_rate": 1.8497596045568002e-08,
|
||
|
|
"loss": 0.3108,
|
||
|
|
"mean_token_accuracy": 0.888460848480463,
|
||
|
|
"num_tokens": 321823512.0,
|
||
|
|
"step": 749
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39996337890625,
|
||
|
|
"epoch": 2.952755905511811,
|
||
|
|
"grad_norm": 0.6265535840932653,
|
||
|
|
"learning_rate": 1.5950135434091853e-08,
|
||
|
|
"loss": 0.3102,
|
||
|
|
"mean_token_accuracy": 0.8862089244648814,
|
||
|
|
"num_tokens": 322231250.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.396820068359375,
|
||
|
|
"epoch": 2.956692913385827,
|
||
|
|
"grad_norm": 0.6406905383870145,
|
||
|
|
"learning_rate": 1.3591182572219031e-08,
|
||
|
|
"loss": 0.3111,
|
||
|
|
"mean_token_accuracy": 0.8872844418510795,
|
||
|
|
"num_tokens": 322651161.0,
|
||
|
|
"step": 751
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39044189453125,
|
||
|
|
"epoch": 2.9606299212598426,
|
||
|
|
"grad_norm": 0.5965161904330141,
|
||
|
|
"learning_rate": 1.14207819990797e-08,
|
||
|
|
"loss": 0.3153,
|
||
|
|
"mean_token_accuracy": 0.8871821071952581,
|
||
|
|
"num_tokens": 323098909.0,
|
||
|
|
"step": 752
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393218994140625,
|
||
|
|
"epoch": 2.9645669291338583,
|
||
|
|
"grad_norm": 0.6127435054185151,
|
||
|
|
"learning_rate": 9.438974693768421e-09,
|
||
|
|
"loss": 0.314,
|
||
|
|
"mean_token_accuracy": 0.8866363558918238,
|
||
|
|
"num_tokens": 323538525.0,
|
||
|
|
"step": 753
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3946533203125,
|
||
|
|
"epoch": 2.968503937007874,
|
||
|
|
"grad_norm": 0.6146105288887688,
|
||
|
|
"learning_rate": 7.645798074572552e-09,
|
||
|
|
"loss": 0.3038,
|
||
|
|
"mean_token_accuracy": 0.8898013606667519,
|
||
|
|
"num_tokens": 323972527.0,
|
||
|
|
"step": 754
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39141845703125,
|
||
|
|
"epoch": 2.97244094488189,
|
||
|
|
"grad_norm": 0.6236575631946764,
|
||
|
|
"learning_rate": 6.0412859982628135e-09,
|
||
|
|
"loss": 0.3336,
|
||
|
|
"mean_token_accuracy": 0.8825578549876809,
|
||
|
|
"num_tokens": 324414207.0,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38677978515625,
|
||
|
|
"epoch": 2.9763779527559056,
|
||
|
|
"grad_norm": 0.6194778300675513,
|
||
|
|
"learning_rate": 4.6254687594538e-09,
|
||
|
|
"loss": 0.3132,
|
||
|
|
"mean_token_accuracy": 0.8866851180791855,
|
||
|
|
"num_tokens": 324850535.0,
|
||
|
|
"step": 756
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389617919921875,
|
||
|
|
"epoch": 2.9803149606299213,
|
||
|
|
"grad_norm": 0.622216336551404,
|
||
|
|
"learning_rate": 3.3983730900377654e-09,
|
||
|
|
"loss": 0.3062,
|
||
|
|
"mean_token_accuracy": 0.8894290942698717,
|
||
|
|
"num_tokens": 325286762.0,
|
||
|
|
"step": 757
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.389984130859375,
|
||
|
|
"epoch": 2.984251968503937,
|
||
|
|
"grad_norm": 0.5998108891950104,
|
||
|
|
"learning_rate": 2.3600221586717043e-09,
|
||
|
|
"loss": 0.3134,
|
||
|
|
"mean_token_accuracy": 0.8875658120959997,
|
||
|
|
"num_tokens": 325712765.0,
|
||
|
|
"step": 758
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.38818359375,
|
||
|
|
"epoch": 2.9881889763779528,
|
||
|
|
"grad_norm": 0.6160587396973891,
|
||
|
|
"learning_rate": 1.5104355703465801e-09,
|
||
|
|
"loss": 0.3021,
|
||
|
|
"mean_token_accuracy": 0.8908078372478485,
|
||
|
|
"num_tokens": 326158601.0,
|
||
|
|
"step": 759
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.3980712890625,
|
||
|
|
"epoch": 2.9921259842519685,
|
||
|
|
"grad_norm": 0.631334091047954,
|
||
|
|
"learning_rate": 8.496293660120725e-10,
|
||
|
|
"loss": 0.309,
|
||
|
|
"mean_token_accuracy": 0.8900680867955089,
|
||
|
|
"num_tokens": 326581952.0,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.393585205078125,
|
||
|
|
"epoch": 2.9960629921259843,
|
||
|
|
"grad_norm": 0.6025923101996601,
|
||
|
|
"learning_rate": 3.7761602227903705e-10,
|
||
|
|
"loss": 0.3141,
|
||
|
|
"mean_token_accuracy": 0.885806068778038,
|
||
|
|
"num_tokens": 327014051.0,
|
||
|
|
"step": 761
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 0.39324951171875,
|
||
|
|
"epoch": 3.0,
|
||
|
|
"grad_norm": 0.646299642666546,
|
||
|
|
"learning_rate": 9.44044511796971e-11,
|
||
|
|
"loss": 0.3105,
|
||
|
|
"mean_token_accuracy": 0.8872898044064641,
|
||
|
|
"num_tokens": 327455853.0,
|
||
|
|
"step": 762
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 3.0,
|
||
|
|
"step": 762,
|
||
|
|
"total_flos": 605566838833152.0,
|
||
|
|
"train_loss": 0.4404701322238902,
|
||
|
|
"train_runtime": 58825.4442,
|
||
|
|
"train_samples_per_second": 1.255,
|
||
|
|
"train_steps_per_second": 0.013
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 762,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 3,
|
||
|
|
"save_steps": 64,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 605566838833152.0,
|
||
|
|
"train_batch_size": 1,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|