12057 lines
330 KiB
JSON
12057 lines
330 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 0.5040957781978576,
|
||
|
|
"eval_steps": 3000,
|
||
|
|
"global_step": 6000,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"entropy": 10.742606925964356,
|
||
|
|
"epoch": 0.0004200798151648813,
|
||
|
|
"grad_norm": 5.21875,
|
||
|
|
"learning_rate": 2e-06,
|
||
|
|
"loss": 10.7358,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 8348.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.74260492324829,
|
||
|
|
"epoch": 0.0008401596303297626,
|
||
|
|
"grad_norm": 5.15625,
|
||
|
|
"learning_rate": 4.5e-06,
|
||
|
|
"loss": 10.7547,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 17465.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742631721496583,
|
||
|
|
"epoch": 0.001260239445494644,
|
||
|
|
"grad_norm": 5.25,
|
||
|
|
"learning_rate": 7e-06,
|
||
|
|
"loss": 10.7247,
|
||
|
|
"mean_token_accuracy": 0.00010341261513531208,
|
||
|
|
"num_tokens": 26627.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742714214324952,
|
||
|
|
"epoch": 0.0016803192606595252,
|
||
|
|
"grad_norm": 4.96875,
|
||
|
|
"learning_rate": 9.5e-06,
|
||
|
|
"loss": 10.6807,
|
||
|
|
"mean_token_accuracy": 0.0,
|
||
|
|
"num_tokens": 36069.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742774486541748,
|
||
|
|
"epoch": 0.002100399075824407,
|
||
|
|
"grad_norm": 4.96875,
|
||
|
|
"learning_rate": 1.2e-05,
|
||
|
|
"loss": 10.564,
|
||
|
|
"mean_token_accuracy": 0.0009151221020147204,
|
||
|
|
"num_tokens": 44967.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.742547607421875,
|
||
|
|
"epoch": 0.002520478890989288,
|
||
|
|
"grad_norm": 3.8125,
|
||
|
|
"learning_rate": 1.4500000000000002e-05,
|
||
|
|
"loss": 10.4843,
|
||
|
|
"mean_token_accuracy": 0.0172414593398571,
|
||
|
|
"num_tokens": 55132.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.741770172119141,
|
||
|
|
"epoch": 0.0029405587061541692,
|
||
|
|
"grad_norm": 3.1875,
|
||
|
|
"learning_rate": 1.7000000000000003e-05,
|
||
|
|
"loss": 10.3322,
|
||
|
|
"mean_token_accuracy": 0.044619453698396684,
|
||
|
|
"num_tokens": 65141.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.739381885528564,
|
||
|
|
"epoch": 0.0033606385213190504,
|
||
|
|
"grad_norm": 2.484375,
|
||
|
|
"learning_rate": 1.95e-05,
|
||
|
|
"loss": 10.2048,
|
||
|
|
"mean_token_accuracy": 0.04063304513692856,
|
||
|
|
"num_tokens": 74007.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.735391807556152,
|
||
|
|
"epoch": 0.003780718336483932,
|
||
|
|
"grad_norm": 2.203125,
|
||
|
|
"learning_rate": 2.2e-05,
|
||
|
|
"loss": 10.1027,
|
||
|
|
"mean_token_accuracy": 0.04380051270127296,
|
||
|
|
"num_tokens": 83736.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.731560325622558,
|
||
|
|
"epoch": 0.004200798151648814,
|
||
|
|
"grad_norm": 2.03125,
|
||
|
|
"learning_rate": 2.4500000000000003e-05,
|
||
|
|
"loss": 10.0024,
|
||
|
|
"mean_token_accuracy": 0.04462047629058361,
|
||
|
|
"num_tokens": 92525.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.729215049743653,
|
||
|
|
"epoch": 0.004620877966813695,
|
||
|
|
"grad_norm": 2.046875,
|
||
|
|
"learning_rate": 2.7e-05,
|
||
|
|
"loss": 9.9462,
|
||
|
|
"mean_token_accuracy": 0.042681990377604964,
|
||
|
|
"num_tokens": 102015.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.728453350067138,
|
||
|
|
"epoch": 0.005040957781978576,
|
||
|
|
"grad_norm": 1.7890625,
|
||
|
|
"learning_rate": 2.95e-05,
|
||
|
|
"loss": 9.9154,
|
||
|
|
"mean_token_accuracy": 0.03954915180802345,
|
||
|
|
"num_tokens": 110887.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.727616500854491,
|
||
|
|
"epoch": 0.005461037597143457,
|
||
|
|
"grad_norm": 1.8828125,
|
||
|
|
"learning_rate": 3.2e-05,
|
||
|
|
"loss": 9.8453,
|
||
|
|
"mean_token_accuracy": 0.04232911877334118,
|
||
|
|
"num_tokens": 120442.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.726141738891602,
|
||
|
|
"epoch": 0.0058811174123083385,
|
||
|
|
"grad_norm": 1.9609375,
|
||
|
|
"learning_rate": 3.4500000000000005e-05,
|
||
|
|
"loss": 9.7509,
|
||
|
|
"mean_token_accuracy": 0.041194649040699007,
|
||
|
|
"num_tokens": 129297.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.723711013793945,
|
||
|
|
"epoch": 0.00630119722747322,
|
||
|
|
"grad_norm": 1.8828125,
|
||
|
|
"learning_rate": 3.7e-05,
|
||
|
|
"loss": 9.7015,
|
||
|
|
"mean_token_accuracy": 0.04228766188025475,
|
||
|
|
"num_tokens": 138305.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.719814491271972,
|
||
|
|
"epoch": 0.006721277042638101,
|
||
|
|
"grad_norm": 1.96875,
|
||
|
|
"learning_rate": 3.95e-05,
|
||
|
|
"loss": 9.6499,
|
||
|
|
"mean_token_accuracy": 0.04200226049870252,
|
||
|
|
"num_tokens": 147640.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.714290428161622,
|
||
|
|
"epoch": 0.007141356857802983,
|
||
|
|
"grad_norm": 1.8515625,
|
||
|
|
"learning_rate": 4.2000000000000004e-05,
|
||
|
|
"loss": 9.576,
|
||
|
|
"mean_token_accuracy": 0.04255363866686821,
|
||
|
|
"num_tokens": 157633.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.707357215881348,
|
||
|
|
"epoch": 0.007561436672967864,
|
||
|
|
"grad_norm": 1.671875,
|
||
|
|
"learning_rate": 4.45e-05,
|
||
|
|
"loss": 9.5382,
|
||
|
|
"mean_token_accuracy": 0.03800953794270754,
|
||
|
|
"num_tokens": 167984.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.699947547912597,
|
||
|
|
"epoch": 0.007981516488132745,
|
||
|
|
"grad_norm": 1.7421875,
|
||
|
|
"learning_rate": 4.7000000000000004e-05,
|
||
|
|
"loss": 9.4351,
|
||
|
|
"mean_token_accuracy": 0.04883353523910046,
|
||
|
|
"num_tokens": 176984.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.683709812164306,
|
||
|
|
"epoch": 0.008401596303297627,
|
||
|
|
"grad_norm": 1.890625,
|
||
|
|
"learning_rate": 4.9500000000000004e-05,
|
||
|
|
"loss": 9.3133,
|
||
|
|
"mean_token_accuracy": 0.051684480533003806,
|
||
|
|
"num_tokens": 185931.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.665494632720947,
|
||
|
|
"epoch": 0.008821676118462508,
|
||
|
|
"grad_norm": 1.859375,
|
||
|
|
"learning_rate": 5.2e-05,
|
||
|
|
"loss": 9.2723,
|
||
|
|
"mean_token_accuracy": 0.05058838985860348,
|
||
|
|
"num_tokens": 195065.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.650426483154297,
|
||
|
|
"epoch": 0.00924175593362739,
|
||
|
|
"grad_norm": 1.703125,
|
||
|
|
"learning_rate": 5.45e-05,
|
||
|
|
"loss": 9.1345,
|
||
|
|
"mean_token_accuracy": 0.05380081832408905,
|
||
|
|
"num_tokens": 203687.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.613165855407715,
|
||
|
|
"epoch": 0.00966183574879227,
|
||
|
|
"grad_norm": 1.6484375,
|
||
|
|
"learning_rate": 5.7e-05,
|
||
|
|
"loss": 9.0467,
|
||
|
|
"mean_token_accuracy": 0.057396522164344786,
|
||
|
|
"num_tokens": 212847.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.554168796539306,
|
||
|
|
"epoch": 0.010081915563957152,
|
||
|
|
"grad_norm": 1.6875,
|
||
|
|
"learning_rate": 5.9499999999999996e-05,
|
||
|
|
"loss": 8.93,
|
||
|
|
"mean_token_accuracy": 0.05599412247538567,
|
||
|
|
"num_tokens": 222593.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.50309362411499,
|
||
|
|
"epoch": 0.010501995379122032,
|
||
|
|
"grad_norm": 1.6875,
|
||
|
|
"learning_rate": 6.2e-05,
|
||
|
|
"loss": 8.7842,
|
||
|
|
"mean_token_accuracy": 0.054633737355470655,
|
||
|
|
"num_tokens": 231174.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.446444129943847,
|
||
|
|
"epoch": 0.010922075194286915,
|
||
|
|
"grad_norm": 1.5546875,
|
||
|
|
"learning_rate": 6.450000000000001e-05,
|
||
|
|
"loss": 8.6507,
|
||
|
|
"mean_token_accuracy": 0.05882068388164043,
|
||
|
|
"num_tokens": 239833.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.371571159362793,
|
||
|
|
"epoch": 0.011342155009451797,
|
||
|
|
"grad_norm": 1.53125,
|
||
|
|
"learning_rate": 6.7e-05,
|
||
|
|
"loss": 8.62,
|
||
|
|
"mean_token_accuracy": 0.05638743191957474,
|
||
|
|
"num_tokens": 248794.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.297250938415527,
|
||
|
|
"epoch": 0.011762234824616677,
|
||
|
|
"grad_norm": 1.4375,
|
||
|
|
"learning_rate": 6.950000000000001e-05,
|
||
|
|
"loss": 8.5299,
|
||
|
|
"mean_token_accuracy": 0.056220804899930955,
|
||
|
|
"num_tokens": 257123.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.228730010986329,
|
||
|
|
"epoch": 0.012182314639781559,
|
||
|
|
"grad_norm": 1.453125,
|
||
|
|
"learning_rate": 7.2e-05,
|
||
|
|
"loss": 8.2842,
|
||
|
|
"mean_token_accuracy": 0.05619280487298965,
|
||
|
|
"num_tokens": 266088.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 10.08653745651245,
|
||
|
|
"epoch": 0.01260239445494644,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 7.45e-05,
|
||
|
|
"loss": 8.3619,
|
||
|
|
"mean_token_accuracy": 0.0516346599906683,
|
||
|
|
"num_tokens": 276074.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.963776969909668,
|
||
|
|
"epoch": 0.013022474270111321,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 7.7e-05,
|
||
|
|
"loss": 8.1944,
|
||
|
|
"mean_token_accuracy": 0.054025283083319664,
|
||
|
|
"num_tokens": 285280.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.805997848510742,
|
||
|
|
"epoch": 0.013442554085276202,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 7.950000000000001e-05,
|
||
|
|
"loss": 8.151,
|
||
|
|
"mean_token_accuracy": 0.052671706303954124,
|
||
|
|
"num_tokens": 296115.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.606755542755128,
|
||
|
|
"epoch": 0.013862633900441084,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 8.2e-05,
|
||
|
|
"loss": 7.9584,
|
||
|
|
"mean_token_accuracy": 0.05575060956180096,
|
||
|
|
"num_tokens": 305483.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.449717140197754,
|
||
|
|
"epoch": 0.014282713715605966,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 8.450000000000001e-05,
|
||
|
|
"loss": 7.9165,
|
||
|
|
"mean_token_accuracy": 0.058218777552247046,
|
||
|
|
"num_tokens": 314000.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 9.167982482910157,
|
||
|
|
"epoch": 0.014702793530770846,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 8.7e-05,
|
||
|
|
"loss": 7.8517,
|
||
|
|
"mean_token_accuracy": 0.062257979065179825,
|
||
|
|
"num_tokens": 323667.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.951386070251464,
|
||
|
|
"epoch": 0.015122873345935728,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 8.95e-05,
|
||
|
|
"loss": 7.8029,
|
||
|
|
"mean_token_accuracy": 0.06150264739990234,
|
||
|
|
"num_tokens": 332695.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.776250171661378,
|
||
|
|
"epoch": 0.015542953161100609,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 9.2e-05,
|
||
|
|
"loss": 7.643,
|
||
|
|
"mean_token_accuracy": 0.05887415409088135,
|
||
|
|
"num_tokens": 342428.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.602806949615479,
|
||
|
|
"epoch": 0.01596303297626549,
|
||
|
|
"grad_norm": 0.79296875,
|
||
|
|
"learning_rate": 9.45e-05,
|
||
|
|
"loss": 7.7106,
|
||
|
|
"mean_token_accuracy": 0.06374814324080944,
|
||
|
|
"num_tokens": 353587.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.474033164978028,
|
||
|
|
"epoch": 0.01638311279143037,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 9.7e-05,
|
||
|
|
"loss": 7.6401,
|
||
|
|
"mean_token_accuracy": 0.06406850814819336,
|
||
|
|
"num_tokens": 362997.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.364265060424804,
|
||
|
|
"epoch": 0.016803192606595255,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 9.95e-05,
|
||
|
|
"loss": 7.6617,
|
||
|
|
"mean_token_accuracy": 0.06993534453213215,
|
||
|
|
"num_tokens": 372346.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.375140285491943,
|
||
|
|
"epoch": 0.017223272421760135,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000102,
|
||
|
|
"loss": 7.5334,
|
||
|
|
"mean_token_accuracy": 0.06646758764982223,
|
||
|
|
"num_tokens": 381575.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.26815767288208,
|
||
|
|
"epoch": 0.017643352236925015,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.00010449999999999999,
|
||
|
|
"loss": 7.5902,
|
||
|
|
"mean_token_accuracy": 0.07085754275321961,
|
||
|
|
"num_tokens": 390706.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.218460845947266,
|
||
|
|
"epoch": 0.018063432052089896,
|
||
|
|
"grad_norm": 0.828125,
|
||
|
|
"learning_rate": 0.000107,
|
||
|
|
"loss": 7.5876,
|
||
|
|
"mean_token_accuracy": 0.07221915200352669,
|
||
|
|
"num_tokens": 400000.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.139337062835693,
|
||
|
|
"epoch": 0.01848351186725478,
|
||
|
|
"grad_norm": 0.85546875,
|
||
|
|
"learning_rate": 0.0001095,
|
||
|
|
"loss": 7.5295,
|
||
|
|
"mean_token_accuracy": 0.07644539698958397,
|
||
|
|
"num_tokens": 409447.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.122040271759033,
|
||
|
|
"epoch": 0.01890359168241966,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000112,
|
||
|
|
"loss": 7.5068,
|
||
|
|
"mean_token_accuracy": 0.07519292533397674,
|
||
|
|
"num_tokens": 418417.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.067694330215454,
|
||
|
|
"epoch": 0.01932367149758454,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0001145,
|
||
|
|
"loss": 7.4664,
|
||
|
|
"mean_token_accuracy": 0.07503528967499733,
|
||
|
|
"num_tokens": 427619.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.071773529052734,
|
||
|
|
"epoch": 0.019743751312749424,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.00011700000000000001,
|
||
|
|
"loss": 7.5131,
|
||
|
|
"mean_token_accuracy": 0.07185145244002342,
|
||
|
|
"num_tokens": 437931.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.109980726242066,
|
||
|
|
"epoch": 0.020163831127914304,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.00011949999999999999,
|
||
|
|
"loss": 7.552,
|
||
|
|
"mean_token_accuracy": 0.07611973807215691,
|
||
|
|
"num_tokens": 447595.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.026875400543213,
|
||
|
|
"epoch": 0.020583910943079185,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.000122,
|
||
|
|
"loss": 7.4164,
|
||
|
|
"mean_token_accuracy": 0.07035953775048256,
|
||
|
|
"num_tokens": 457062.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.063331604003906,
|
||
|
|
"epoch": 0.021003990758244065,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0001245,
|
||
|
|
"loss": 7.5166,
|
||
|
|
"mean_token_accuracy": 0.07237975299358368,
|
||
|
|
"num_tokens": 466191.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.050399017333984,
|
||
|
|
"epoch": 0.02142407057340895,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.000127,
|
||
|
|
"loss": 7.4443,
|
||
|
|
"mean_token_accuracy": 0.07492763809859752,
|
||
|
|
"num_tokens": 475693.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 8.024266242980957,
|
||
|
|
"epoch": 0.02184415038857383,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0001295,
|
||
|
|
"loss": 7.4691,
|
||
|
|
"mean_token_accuracy": 0.07379123903810977,
|
||
|
|
"num_tokens": 485173.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.993921422958374,
|
||
|
|
"epoch": 0.02226423020373871,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000132,
|
||
|
|
"loss": 7.3863,
|
||
|
|
"mean_token_accuracy": 0.08008474782109261,
|
||
|
|
"num_tokens": 493985.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.907951974868775,
|
||
|
|
"epoch": 0.022684310018903593,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00013450000000000002,
|
||
|
|
"loss": 7.4036,
|
||
|
|
"mean_token_accuracy": 0.07586845718324184,
|
||
|
|
"num_tokens": 502837.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.981403732299805,
|
||
|
|
"epoch": 0.023104389834068473,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.00013700000000000002,
|
||
|
|
"loss": 7.3605,
|
||
|
|
"mean_token_accuracy": 0.07924394458532333,
|
||
|
|
"num_tokens": 511503.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.977783203125,
|
||
|
|
"epoch": 0.023524469649233354,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0001395,
|
||
|
|
"loss": 7.5335,
|
||
|
|
"mean_token_accuracy": 0.0751778606325388,
|
||
|
|
"num_tokens": 521499.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.871473217010498,
|
||
|
|
"epoch": 0.023944549464398234,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00014199999999999998,
|
||
|
|
"loss": 7.2955,
|
||
|
|
"mean_token_accuracy": 0.0799000546336174,
|
||
|
|
"num_tokens": 530067.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.885423564910889,
|
||
|
|
"epoch": 0.024364629279563118,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0001445,
|
||
|
|
"loss": 7.2851,
|
||
|
|
"mean_token_accuracy": 0.08089336939156055,
|
||
|
|
"num_tokens": 538559.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.956486988067627,
|
||
|
|
"epoch": 0.024784709094728,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000147,
|
||
|
|
"loss": 7.4858,
|
||
|
|
"mean_token_accuracy": 0.07482350952923297,
|
||
|
|
"num_tokens": 547288.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.870783424377441,
|
||
|
|
"epoch": 0.02520478890989288,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0001495,
|
||
|
|
"loss": 7.3589,
|
||
|
|
"mean_token_accuracy": 0.07514288201928139,
|
||
|
|
"num_tokens": 557269.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.939627742767334,
|
||
|
|
"epoch": 0.025624868725057762,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000152,
|
||
|
|
"loss": 7.3914,
|
||
|
|
"mean_token_accuracy": 0.07472754344344139,
|
||
|
|
"num_tokens": 567280.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.828274822235107,
|
||
|
|
"epoch": 0.026044948540222643,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00015450000000000001,
|
||
|
|
"loss": 7.2341,
|
||
|
|
"mean_token_accuracy": 0.07823858335614205,
|
||
|
|
"num_tokens": 576609.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.761577320098877,
|
||
|
|
"epoch": 0.026465028355387523,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000157,
|
||
|
|
"loss": 7.1336,
|
||
|
|
"mean_token_accuracy": 0.08791142702102661,
|
||
|
|
"num_tokens": 586053.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.695616436004639,
|
||
|
|
"epoch": 0.026885108170552403,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0001595,
|
||
|
|
"loss": 7.3339,
|
||
|
|
"mean_token_accuracy": 0.08298731297254562,
|
||
|
|
"num_tokens": 594649.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.869348049163818,
|
||
|
|
"epoch": 0.027305187985717287,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000162,
|
||
|
|
"loss": 7.2862,
|
||
|
|
"mean_token_accuracy": 0.07372522614896297,
|
||
|
|
"num_tokens": 603445.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.86638765335083,
|
||
|
|
"epoch": 0.027725267800882167,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00016450000000000001,
|
||
|
|
"loss": 7.3613,
|
||
|
|
"mean_token_accuracy": 0.07848134562373162,
|
||
|
|
"num_tokens": 613611.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.971248960494995,
|
||
|
|
"epoch": 0.028145347616047048,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00016700000000000002,
|
||
|
|
"loss": 7.5217,
|
||
|
|
"mean_token_accuracy": 0.07931054159998893,
|
||
|
|
"num_tokens": 623024.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.725814580917358,
|
||
|
|
"epoch": 0.02856542743121193,
|
||
|
|
"grad_norm": 1.2734375,
|
||
|
|
"learning_rate": 0.00016950000000000003,
|
||
|
|
"loss": 7.225,
|
||
|
|
"mean_token_accuracy": 0.08345521688461303,
|
||
|
|
"num_tokens": 631624.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.762637519836426,
|
||
|
|
"epoch": 0.028985507246376812,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.00017199999999999998,
|
||
|
|
"loss": 7.1844,
|
||
|
|
"mean_token_accuracy": 0.08410112038254738,
|
||
|
|
"num_tokens": 640473.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.841788578033447,
|
||
|
|
"epoch": 0.029405587061541692,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00017449999999999999,
|
||
|
|
"loss": 7.3409,
|
||
|
|
"mean_token_accuracy": 0.08037517666816711,
|
||
|
|
"num_tokens": 649692.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.800195980072021,
|
||
|
|
"epoch": 0.029825666876706573,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000177,
|
||
|
|
"loss": 7.2995,
|
||
|
|
"mean_token_accuracy": 0.08097823038697242,
|
||
|
|
"num_tokens": 658236.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.668969297409058,
|
||
|
|
"epoch": 0.030245746691871456,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0001795,
|
||
|
|
"loss": 7.0948,
|
||
|
|
"mean_token_accuracy": 0.08619136661291123,
|
||
|
|
"num_tokens": 667175.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.798488330841065,
|
||
|
|
"epoch": 0.030665826507036337,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000182,
|
||
|
|
"loss": 7.3842,
|
||
|
|
"mean_token_accuracy": 0.07823293879628182,
|
||
|
|
"num_tokens": 676456.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.812319660186768,
|
||
|
|
"epoch": 0.031085906322201217,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0001845,
|
||
|
|
"loss": 7.3503,
|
||
|
|
"mean_token_accuracy": 0.07726633399724961,
|
||
|
|
"num_tokens": 686881.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.688674831390381,
|
||
|
|
"epoch": 0.0315059861373661,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000187,
|
||
|
|
"loss": 7.1373,
|
||
|
|
"mean_token_accuracy": 0.0819906547665596,
|
||
|
|
"num_tokens": 696045.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.655067443847656,
|
||
|
|
"epoch": 0.03192606595253098,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0001895,
|
||
|
|
"loss": 7.1112,
|
||
|
|
"mean_token_accuracy": 0.08879919424653053,
|
||
|
|
"num_tokens": 704729.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.4980494499206545,
|
||
|
|
"epoch": 0.032346145767695865,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.000192,
|
||
|
|
"loss": 7.1679,
|
||
|
|
"mean_token_accuracy": 0.07921729236841202,
|
||
|
|
"num_tokens": 714331.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.735121536254883,
|
||
|
|
"epoch": 0.03276622558286074,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0001945,
|
||
|
|
"loss": 7.1229,
|
||
|
|
"mean_token_accuracy": 0.08520057946443557,
|
||
|
|
"num_tokens": 722788.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.683975791931152,
|
||
|
|
"epoch": 0.033186305398025626,
|
||
|
|
"grad_norm": 1.2421875,
|
||
|
|
"learning_rate": 0.00019700000000000002,
|
||
|
|
"loss": 7.1944,
|
||
|
|
"mean_token_accuracy": 0.08690556064248085,
|
||
|
|
"num_tokens": 731417.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.576824569702149,
|
||
|
|
"epoch": 0.03360638521319051,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.00019950000000000002,
|
||
|
|
"loss": 7.1549,
|
||
|
|
"mean_token_accuracy": 0.08151165619492531,
|
||
|
|
"num_tokens": 741034.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.698281908035279,
|
||
|
|
"epoch": 0.034026465028355386,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000202,
|
||
|
|
"loss": 7.156,
|
||
|
|
"mean_token_accuracy": 0.08484743162989616,
|
||
|
|
"num_tokens": 749596.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.556124067306518,
|
||
|
|
"epoch": 0.03444654484352027,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.00020449999999999998,
|
||
|
|
"loss": 7.1145,
|
||
|
|
"mean_token_accuracy": 0.08153974264860153,
|
||
|
|
"num_tokens": 758931.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.533982944488526,
|
||
|
|
"epoch": 0.03486662465868515,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000207,
|
||
|
|
"loss": 7.0206,
|
||
|
|
"mean_token_accuracy": 0.09019657000899314,
|
||
|
|
"num_tokens": 767534.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.6061821460723875,
|
||
|
|
"epoch": 0.03528670447385003,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0002095,
|
||
|
|
"loss": 7.0789,
|
||
|
|
"mean_token_accuracy": 0.08290171101689339,
|
||
|
|
"num_tokens": 776456.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.5107566833496096,
|
||
|
|
"epoch": 0.035706784289014915,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000212,
|
||
|
|
"loss": 7.1362,
|
||
|
|
"mean_token_accuracy": 0.08152465149760246,
|
||
|
|
"num_tokens": 786172.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.553678846359253,
|
||
|
|
"epoch": 0.03612686410417979,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0002145,
|
||
|
|
"loss": 7.0139,
|
||
|
|
"mean_token_accuracy": 0.09106989204883575,
|
||
|
|
"num_tokens": 795081.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.604944372177124,
|
||
|
|
"epoch": 0.036546943919344675,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.00021700000000000002,
|
||
|
|
"loss": 7.0628,
|
||
|
|
"mean_token_accuracy": 0.08461785838007926,
|
||
|
|
"num_tokens": 804259.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.534902191162109,
|
||
|
|
"epoch": 0.03696702373450956,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0002195,
|
||
|
|
"loss": 7.0873,
|
||
|
|
"mean_token_accuracy": 0.08283074498176575,
|
||
|
|
"num_tokens": 813463.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.502531671524048,
|
||
|
|
"epoch": 0.037387103549674436,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000222,
|
||
|
|
"loss": 7.0035,
|
||
|
|
"mean_token_accuracy": 0.09452007561922074,
|
||
|
|
"num_tokens": 823029.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.486780834197998,
|
||
|
|
"epoch": 0.03780718336483932,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0002245,
|
||
|
|
"loss": 7.0727,
|
||
|
|
"mean_token_accuracy": 0.08529324010014534,
|
||
|
|
"num_tokens": 832902.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.476432847976684,
|
||
|
|
"epoch": 0.0382272631800042,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.00022700000000000002,
|
||
|
|
"loss": 7.0158,
|
||
|
|
"mean_token_accuracy": 0.08854726403951645,
|
||
|
|
"num_tokens": 842162.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.52789797782898,
|
||
|
|
"epoch": 0.03864734299516908,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.00022950000000000002,
|
||
|
|
"loss": 7.0493,
|
||
|
|
"mean_token_accuracy": 0.08622511699795724,
|
||
|
|
"num_tokens": 852328.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.449561357498169,
|
||
|
|
"epoch": 0.039067422810333964,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00023200000000000003,
|
||
|
|
"loss": 7.0104,
|
||
|
|
"mean_token_accuracy": 0.09133929386734962,
|
||
|
|
"num_tokens": 860929.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.458409357070923,
|
||
|
|
"epoch": 0.03948750262549885,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00023449999999999998,
|
||
|
|
"loss": 7.0901,
|
||
|
|
"mean_token_accuracy": 0.08522843271493911,
|
||
|
|
"num_tokens": 869144.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.584603118896484,
|
||
|
|
"epoch": 0.039907582440663725,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000237,
|
||
|
|
"loss": 7.03,
|
||
|
|
"mean_token_accuracy": 0.09454337358474732,
|
||
|
|
"num_tokens": 877447.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.431310081481934,
|
||
|
|
"epoch": 0.04032766225582861,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0002395,
|
||
|
|
"loss": 6.9871,
|
||
|
|
"mean_token_accuracy": 0.08733554184436798,
|
||
|
|
"num_tokens": 887020.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.453667879104614,
|
||
|
|
"epoch": 0.040747742070993485,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000242,
|
||
|
|
"loss": 7.0323,
|
||
|
|
"mean_token_accuracy": 0.08681000843644142,
|
||
|
|
"num_tokens": 895937.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.41835618019104,
|
||
|
|
"epoch": 0.04116782188615837,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0002445,
|
||
|
|
"loss": 7.0366,
|
||
|
|
"mean_token_accuracy": 0.08261745497584343,
|
||
|
|
"num_tokens": 905446.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.464281463623047,
|
||
|
|
"epoch": 0.04158790170132325,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000247,
|
||
|
|
"loss": 6.9289,
|
||
|
|
"mean_token_accuracy": 0.09576694294810295,
|
||
|
|
"num_tokens": 914547.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.421106290817261,
|
||
|
|
"epoch": 0.04200798151648813,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0002495,
|
||
|
|
"loss": 6.9377,
|
||
|
|
"mean_token_accuracy": 0.0962467186152935,
|
||
|
|
"num_tokens": 922900.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.401471900939941,
|
||
|
|
"epoch": 0.042428061331653014,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.000252,
|
||
|
|
"loss": 6.9572,
|
||
|
|
"mean_token_accuracy": 0.09509932994842529,
|
||
|
|
"num_tokens": 930876.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.342588901519775,
|
||
|
|
"epoch": 0.0428481411468179,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0002545,
|
||
|
|
"loss": 7.0021,
|
||
|
|
"mean_token_accuracy": 0.09231638312339782,
|
||
|
|
"num_tokens": 939871.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.44086856842041,
|
||
|
|
"epoch": 0.043268220961982774,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000257,
|
||
|
|
"loss": 6.988,
|
||
|
|
"mean_token_accuracy": 0.09245615154504776,
|
||
|
|
"num_tokens": 948673.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.274595832824707,
|
||
|
|
"epoch": 0.04368830077714766,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0002595,
|
||
|
|
"loss": 6.9409,
|
||
|
|
"mean_token_accuracy": 0.08984568417072296,
|
||
|
|
"num_tokens": 957603.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.436605787277221,
|
||
|
|
"epoch": 0.04410838059231254,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.000262,
|
||
|
|
"loss": 7.0062,
|
||
|
|
"mean_token_accuracy": 0.08319340422749519,
|
||
|
|
"num_tokens": 967731.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.435907888412475,
|
||
|
|
"epoch": 0.04452846040747742,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00026450000000000003,
|
||
|
|
"loss": 7.0032,
|
||
|
|
"mean_token_accuracy": 0.09049810692667962,
|
||
|
|
"num_tokens": 977427.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.3634380340576175,
|
||
|
|
"epoch": 0.0449485402226423,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.00026700000000000004,
|
||
|
|
"loss": 6.9827,
|
||
|
|
"mean_token_accuracy": 0.0860845424234867,
|
||
|
|
"num_tokens": 986758.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.425018453598023,
|
||
|
|
"epoch": 0.045368620037807186,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.00026950000000000005,
|
||
|
|
"loss": 6.9738,
|
||
|
|
"mean_token_accuracy": 0.09986243322491646,
|
||
|
|
"num_tokens": 996377.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.333861589431763,
|
||
|
|
"epoch": 0.04578869985297206,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.00027200000000000005,
|
||
|
|
"loss": 7.0222,
|
||
|
|
"mean_token_accuracy": 0.08520096391439438,
|
||
|
|
"num_tokens": 1006483.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.269639205932617,
|
||
|
|
"epoch": 0.04620877966813695,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0002745,
|
||
|
|
"loss": 6.9248,
|
||
|
|
"mean_token_accuracy": 0.091129120439291,
|
||
|
|
"num_tokens": 1016132.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.3355879306793215,
|
||
|
|
"epoch": 0.04662885948330183,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000277,
|
||
|
|
"loss": 6.8796,
|
||
|
|
"mean_token_accuracy": 0.09489664137363434,
|
||
|
|
"num_tokens": 1024970.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.3572368144989015,
|
||
|
|
"epoch": 0.04704893929846671,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0002795,
|
||
|
|
"loss": 6.9525,
|
||
|
|
"mean_token_accuracy": 0.09272714778780937,
|
||
|
|
"num_tokens": 1034335.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.423572063446045,
|
||
|
|
"epoch": 0.04746901911363159,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00028199999999999997,
|
||
|
|
"loss": 7.0075,
|
||
|
|
"mean_token_accuracy": 0.09945140630006791,
|
||
|
|
"num_tokens": 1043954.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.319319725036621,
|
||
|
|
"epoch": 0.04788909892879647,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0002845,
|
||
|
|
"loss": 6.9431,
|
||
|
|
"mean_token_accuracy": 0.09524357318878174,
|
||
|
|
"num_tokens": 1053554.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.376662826538086,
|
||
|
|
"epoch": 0.04830917874396135,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000287,
|
||
|
|
"loss": 6.8893,
|
||
|
|
"mean_token_accuracy": 0.0956316351890564,
|
||
|
|
"num_tokens": 1062008.0,
|
||
|
|
"step": 575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.246560859680176,
|
||
|
|
"epoch": 0.048729258559126236,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0002895,
|
||
|
|
"loss": 6.9602,
|
||
|
|
"mean_token_accuracy": 0.09502239599823951,
|
||
|
|
"num_tokens": 1070740.0,
|
||
|
|
"step": 580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.361734390258789,
|
||
|
|
"epoch": 0.04914933837429111,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000292,
|
||
|
|
"loss": 6.9451,
|
||
|
|
"mean_token_accuracy": 0.09238593950867653,
|
||
|
|
"num_tokens": 1079681.0,
|
||
|
|
"step": 585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.294089078903198,
|
||
|
|
"epoch": 0.049569418189456,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0002945,
|
||
|
|
"loss": 6.8326,
|
||
|
|
"mean_token_accuracy": 0.09609337821602822,
|
||
|
|
"num_tokens": 1088979.0,
|
||
|
|
"step": 590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.192009592056275,
|
||
|
|
"epoch": 0.04998949800462088,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.000297,
|
||
|
|
"loss": 6.8381,
|
||
|
|
"mean_token_accuracy": 0.09695586860179901,
|
||
|
|
"num_tokens": 1097870.0,
|
||
|
|
"step": 595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.285109043121338,
|
||
|
|
"epoch": 0.05040957781978576,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0002995,
|
||
|
|
"loss": 6.9361,
|
||
|
|
"mean_token_accuracy": 0.09410082027316094,
|
||
|
|
"num_tokens": 1107948.0,
|
||
|
|
"step": 600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2816235542297365,
|
||
|
|
"epoch": 0.05082965763495064,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000302,
|
||
|
|
"loss": 6.856,
|
||
|
|
"mean_token_accuracy": 0.09758619442582131,
|
||
|
|
"num_tokens": 1117032.0,
|
||
|
|
"step": 605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.1946680545806885,
|
||
|
|
"epoch": 0.051249737450115525,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0003045,
|
||
|
|
"loss": 6.8323,
|
||
|
|
"mean_token_accuracy": 0.09758584424853325,
|
||
|
|
"num_tokens": 1127834.0,
|
||
|
|
"step": 610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.325930643081665,
|
||
|
|
"epoch": 0.0516698172652804,
|
||
|
|
"grad_norm": 1.234375,
|
||
|
|
"learning_rate": 0.000307,
|
||
|
|
"loss": 6.9314,
|
||
|
|
"mean_token_accuracy": 0.10701763778924941,
|
||
|
|
"num_tokens": 1137382.0,
|
||
|
|
"step": 615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.191529178619385,
|
||
|
|
"epoch": 0.052089897080445285,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0003095,
|
||
|
|
"loss": 6.7726,
|
||
|
|
"mean_token_accuracy": 0.1016211412847042,
|
||
|
|
"num_tokens": 1146095.0,
|
||
|
|
"step": 620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.197086191177368,
|
||
|
|
"epoch": 0.05250997689561017,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000312,
|
||
|
|
"loss": 6.8164,
|
||
|
|
"mean_token_accuracy": 0.09977484568953514,
|
||
|
|
"num_tokens": 1154981.0,
|
||
|
|
"step": 625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.111207914352417,
|
||
|
|
"epoch": 0.052930056710775046,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.0003145,
|
||
|
|
"loss": 6.822,
|
||
|
|
"mean_token_accuracy": 0.09889646545052529,
|
||
|
|
"num_tokens": 1164939.0,
|
||
|
|
"step": 630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.286598014831543,
|
||
|
|
"epoch": 0.05335013652593993,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000317,
|
||
|
|
"loss": 6.9423,
|
||
|
|
"mean_token_accuracy": 0.0905054323375225,
|
||
|
|
"num_tokens": 1174991.0,
|
||
|
|
"step": 635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.268424129486084,
|
||
|
|
"epoch": 0.05377021634110481,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0003195,
|
||
|
|
"loss": 6.9893,
|
||
|
|
"mean_token_accuracy": 0.09030458927154542,
|
||
|
|
"num_tokens": 1184885.0,
|
||
|
|
"step": 640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.25072751045227,
|
||
|
|
"epoch": 0.05419029615626969,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.000322,
|
||
|
|
"loss": 6.8843,
|
||
|
|
"mean_token_accuracy": 0.09418094158172607,
|
||
|
|
"num_tokens": 1193637.0,
|
||
|
|
"step": 645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.144441413879394,
|
||
|
|
"epoch": 0.054610375971434574,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00032450000000000003,
|
||
|
|
"loss": 6.6712,
|
||
|
|
"mean_token_accuracy": 0.10373484939336777,
|
||
|
|
"num_tokens": 1202188.0,
|
||
|
|
"step": 650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.2327552318573,
|
||
|
|
"epoch": 0.05503045578659945,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.00032700000000000003,
|
||
|
|
"loss": 6.8046,
|
||
|
|
"mean_token_accuracy": 0.09572408124804496,
|
||
|
|
"num_tokens": 1210768.0,
|
||
|
|
"step": 655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.196833848953247,
|
||
|
|
"epoch": 0.055450535601764335,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.00032950000000000004,
|
||
|
|
"loss": 6.8024,
|
||
|
|
"mean_token_accuracy": 0.09782998114824296,
|
||
|
|
"num_tokens": 1219819.0,
|
||
|
|
"step": 660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.211909484863281,
|
||
|
|
"epoch": 0.05587061541692922,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.00033200000000000005,
|
||
|
|
"loss": 6.8553,
|
||
|
|
"mean_token_accuracy": 0.09061138033866882,
|
||
|
|
"num_tokens": 1229703.0,
|
||
|
|
"step": 665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.242569494247436,
|
||
|
|
"epoch": 0.056290695232094096,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.00033450000000000005,
|
||
|
|
"loss": 6.8929,
|
||
|
|
"mean_token_accuracy": 0.09304608702659607,
|
||
|
|
"num_tokens": 1238942.0,
|
||
|
|
"step": 670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.276552438735962,
|
||
|
|
"epoch": 0.05671077504725898,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000337,
|
||
|
|
"loss": 6.9316,
|
||
|
|
"mean_token_accuracy": 0.09855509251356125,
|
||
|
|
"num_tokens": 1248943.0,
|
||
|
|
"step": 675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.130473899841308,
|
||
|
|
"epoch": 0.05713085486242386,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0003395,
|
||
|
|
"loss": 6.8196,
|
||
|
|
"mean_token_accuracy": 0.09641827270388603,
|
||
|
|
"num_tokens": 1257761.0,
|
||
|
|
"step": 680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.069635629653931,
|
||
|
|
"epoch": 0.05755093467758874,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000342,
|
||
|
|
"loss": 6.7531,
|
||
|
|
"mean_token_accuracy": 0.09635655134916306,
|
||
|
|
"num_tokens": 1267216.0,
|
||
|
|
"step": 685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.244167423248291,
|
||
|
|
"epoch": 0.057971014492753624,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00034449999999999997,
|
||
|
|
"loss": 6.8517,
|
||
|
|
"mean_token_accuracy": 0.09775793552398682,
|
||
|
|
"num_tokens": 1277210.0,
|
||
|
|
"step": 690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.151098155975342,
|
||
|
|
"epoch": 0.05839109430791851,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000347,
|
||
|
|
"loss": 6.7848,
|
||
|
|
"mean_token_accuracy": 0.09209914952516556,
|
||
|
|
"num_tokens": 1285310.0,
|
||
|
|
"step": 695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.133235788345337,
|
||
|
|
"epoch": 0.058811174123083385,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0003495,
|
||
|
|
"loss": 6.7884,
|
||
|
|
"mean_token_accuracy": 0.0997276745736599,
|
||
|
|
"num_tokens": 1294421.0,
|
||
|
|
"step": 700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.089715480804443,
|
||
|
|
"epoch": 0.05923125393824827,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000352,
|
||
|
|
"loss": 6.6149,
|
||
|
|
"mean_token_accuracy": 0.10670206919312478,
|
||
|
|
"num_tokens": 1303281.0,
|
||
|
|
"step": 705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.096017217636108,
|
||
|
|
"epoch": 0.059651333753413145,
|
||
|
|
"grad_norm": 1.3046875,
|
||
|
|
"learning_rate": 0.0003545,
|
||
|
|
"loss": 6.7841,
|
||
|
|
"mean_token_accuracy": 0.1047137551009655,
|
||
|
|
"num_tokens": 1312280.0,
|
||
|
|
"step": 710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.01336669921875,
|
||
|
|
"epoch": 0.06007141356857803,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000357,
|
||
|
|
"loss": 6.7519,
|
||
|
|
"mean_token_accuracy": 0.09830996096134186,
|
||
|
|
"num_tokens": 1321243.0,
|
||
|
|
"step": 715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.150788021087647,
|
||
|
|
"epoch": 0.06049149338374291,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0003595,
|
||
|
|
"loss": 6.8411,
|
||
|
|
"mean_token_accuracy": 0.0983475923538208,
|
||
|
|
"num_tokens": 1330324.0,
|
||
|
|
"step": 720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.074830770492554,
|
||
|
|
"epoch": 0.06091157319890779,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.000362,
|
||
|
|
"loss": 6.6865,
|
||
|
|
"mean_token_accuracy": 0.1045832097530365,
|
||
|
|
"num_tokens": 1339485.0,
|
||
|
|
"step": 725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.180077934265137,
|
||
|
|
"epoch": 0.06133165301407267,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.0003645,
|
||
|
|
"loss": 6.8327,
|
||
|
|
"mean_token_accuracy": 0.09178336262702942,
|
||
|
|
"num_tokens": 1348640.0,
|
||
|
|
"step": 730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.070912313461304,
|
||
|
|
"epoch": 0.06175173282923756,
|
||
|
|
"grad_norm": 1.203125,
|
||
|
|
"learning_rate": 0.000367,
|
||
|
|
"loss": 6.7313,
|
||
|
|
"mean_token_accuracy": 0.10252036228775978,
|
||
|
|
"num_tokens": 1357581.0,
|
||
|
|
"step": 735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.097622108459473,
|
||
|
|
"epoch": 0.062171812644402434,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0003695,
|
||
|
|
"loss": 6.7976,
|
||
|
|
"mean_token_accuracy": 0.09888288527727127,
|
||
|
|
"num_tokens": 1367883.0,
|
||
|
|
"step": 740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.072182083129883,
|
||
|
|
"epoch": 0.06259189245956731,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000372,
|
||
|
|
"loss": 6.7536,
|
||
|
|
"mean_token_accuracy": 0.09760352596640587,
|
||
|
|
"num_tokens": 1376936.0,
|
||
|
|
"step": 745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.975026559829712,
|
||
|
|
"epoch": 0.0630119722747322,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0003745,
|
||
|
|
"loss": 6.6653,
|
||
|
|
"mean_token_accuracy": 0.10172178596258163,
|
||
|
|
"num_tokens": 1386359.0,
|
||
|
|
"step": 750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.0470263957977295,
|
||
|
|
"epoch": 0.06343205208989708,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000377,
|
||
|
|
"loss": 6.7205,
|
||
|
|
"mean_token_accuracy": 0.10334330797195435,
|
||
|
|
"num_tokens": 1395223.0,
|
||
|
|
"step": 755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.237481212615966,
|
||
|
|
"epoch": 0.06385213190506196,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0003795,
|
||
|
|
"loss": 6.8854,
|
||
|
|
"mean_token_accuracy": 0.09526007026433944,
|
||
|
|
"num_tokens": 1404917.0,
|
||
|
|
"step": 760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.060393810272217,
|
||
|
|
"epoch": 0.06427221172022685,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.000382,
|
||
|
|
"loss": 6.7712,
|
||
|
|
"mean_token_accuracy": 0.10844952017068862,
|
||
|
|
"num_tokens": 1413348.0,
|
||
|
|
"step": 765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.010181617736817,
|
||
|
|
"epoch": 0.06469229153539173,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0003845,
|
||
|
|
"loss": 6.751,
|
||
|
|
"mean_token_accuracy": 0.0988110676407814,
|
||
|
|
"num_tokens": 1421726.0,
|
||
|
|
"step": 770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.068030214309692,
|
||
|
|
"epoch": 0.0651123713505566,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.00038700000000000003,
|
||
|
|
"loss": 6.7626,
|
||
|
|
"mean_token_accuracy": 0.10152493417263031,
|
||
|
|
"num_tokens": 1430686.0,
|
||
|
|
"step": 775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.124918842315674,
|
||
|
|
"epoch": 0.06553245116572148,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.00038950000000000003,
|
||
|
|
"loss": 6.7567,
|
||
|
|
"mean_token_accuracy": 0.10261558443307876,
|
||
|
|
"num_tokens": 1439499.0,
|
||
|
|
"step": 780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.08576397895813,
|
||
|
|
"epoch": 0.06595253098088637,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00039200000000000004,
|
||
|
|
"loss": 6.7308,
|
||
|
|
"mean_token_accuracy": 0.10436978489160538,
|
||
|
|
"num_tokens": 1448220.0,
|
||
|
|
"step": 785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.918930721282959,
|
||
|
|
"epoch": 0.06637261079605125,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.00039450000000000005,
|
||
|
|
"loss": 6.7623,
|
||
|
|
"mean_token_accuracy": 0.09306630715727807,
|
||
|
|
"num_tokens": 1458217.0,
|
||
|
|
"step": 790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.050667333602905,
|
||
|
|
"epoch": 0.06679269061121614,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.00039700000000000005,
|
||
|
|
"loss": 6.6615,
|
||
|
|
"mean_token_accuracy": 0.10148273557424545,
|
||
|
|
"num_tokens": 1467422.0,
|
||
|
|
"step": 795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.04574761390686,
|
||
|
|
"epoch": 0.06721277042638102,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0003995,
|
||
|
|
"loss": 6.6428,
|
||
|
|
"mean_token_accuracy": 0.10174536257982254,
|
||
|
|
"num_tokens": 1476152.0,
|
||
|
|
"step": 800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.920849370956421,
|
||
|
|
"epoch": 0.06763285024154589,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.000402,
|
||
|
|
"loss": 6.7303,
|
||
|
|
"mean_token_accuracy": 0.09813930094242096,
|
||
|
|
"num_tokens": 1485248.0,
|
||
|
|
"step": 805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.021937704086303,
|
||
|
|
"epoch": 0.06805293005671077,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004045,
|
||
|
|
"loss": 6.6965,
|
||
|
|
"mean_token_accuracy": 0.10005066767334939,
|
||
|
|
"num_tokens": 1494248.0,
|
||
|
|
"step": 810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.009239387512207,
|
||
|
|
"epoch": 0.06847300987187566,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.00040699999999999997,
|
||
|
|
"loss": 6.7988,
|
||
|
|
"mean_token_accuracy": 0.10206111744046212,
|
||
|
|
"num_tokens": 1503565.0,
|
||
|
|
"step": 815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.153907108306885,
|
||
|
|
"epoch": 0.06889308968704054,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004095,
|
||
|
|
"loss": 6.8967,
|
||
|
|
"mean_token_accuracy": 0.09253153279423713,
|
||
|
|
"num_tokens": 1513227.0,
|
||
|
|
"step": 820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.081949377059937,
|
||
|
|
"epoch": 0.06931316950220542,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000412,
|
||
|
|
"loss": 6.6785,
|
||
|
|
"mean_token_accuracy": 0.10418465957045556,
|
||
|
|
"num_tokens": 1522312.0,
|
||
|
|
"step": 825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.934855031967163,
|
||
|
|
"epoch": 0.0697332493173703,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004145,
|
||
|
|
"loss": 6.6359,
|
||
|
|
"mean_token_accuracy": 0.1031254269182682,
|
||
|
|
"num_tokens": 1531720.0,
|
||
|
|
"step": 830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.970464134216309,
|
||
|
|
"epoch": 0.07015332913253518,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.000417,
|
||
|
|
"loss": 6.7192,
|
||
|
|
"mean_token_accuracy": 0.09493932947516441,
|
||
|
|
"num_tokens": 1541238.0,
|
||
|
|
"step": 835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.103578281402588,
|
||
|
|
"epoch": 0.07057340894770006,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004195,
|
||
|
|
"loss": 6.8114,
|
||
|
|
"mean_token_accuracy": 0.0987453043460846,
|
||
|
|
"num_tokens": 1550875.0,
|
||
|
|
"step": 840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.948361873626709,
|
||
|
|
"epoch": 0.07099348876286495,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000422,
|
||
|
|
"loss": 6.7522,
|
||
|
|
"mean_token_accuracy": 0.10080962181091309,
|
||
|
|
"num_tokens": 1560287.0,
|
||
|
|
"step": 845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.981166744232178,
|
||
|
|
"epoch": 0.07141356857802983,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004245,
|
||
|
|
"loss": 6.6378,
|
||
|
|
"mean_token_accuracy": 0.10372715294361115,
|
||
|
|
"num_tokens": 1569043.0,
|
||
|
|
"step": 850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.902826881408691,
|
||
|
|
"epoch": 0.07183364839319471,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000427,
|
||
|
|
"loss": 6.6697,
|
||
|
|
"mean_token_accuracy": 0.10197147876024246,
|
||
|
|
"num_tokens": 1578112.0,
|
||
|
|
"step": 855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.874331331253051,
|
||
|
|
"epoch": 0.07225372820835958,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004295,
|
||
|
|
"loss": 6.5725,
|
||
|
|
"mean_token_accuracy": 0.1078405149281025,
|
||
|
|
"num_tokens": 1586587.0,
|
||
|
|
"step": 860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.059461355209351,
|
||
|
|
"epoch": 0.07267380802352447,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000432,
|
||
|
|
"loss": 6.7397,
|
||
|
|
"mean_token_accuracy": 0.09989926218986511,
|
||
|
|
"num_tokens": 1595585.0,
|
||
|
|
"step": 865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.951946210861206,
|
||
|
|
"epoch": 0.07309388783868935,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004345,
|
||
|
|
"loss": 6.6946,
|
||
|
|
"mean_token_accuracy": 0.10353797450661659,
|
||
|
|
"num_tokens": 1605355.0,
|
||
|
|
"step": 870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.944614362716675,
|
||
|
|
"epoch": 0.07351396765385423,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000437,
|
||
|
|
"loss": 6.7108,
|
||
|
|
"mean_token_accuracy": 0.09883329644799232,
|
||
|
|
"num_tokens": 1613637.0,
|
||
|
|
"step": 875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.975859832763672,
|
||
|
|
"epoch": 0.07393404746901912,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004395,
|
||
|
|
"loss": 6.6703,
|
||
|
|
"mean_token_accuracy": 0.10343916267156601,
|
||
|
|
"num_tokens": 1622731.0,
|
||
|
|
"step": 880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.003747940063477,
|
||
|
|
"epoch": 0.074354127284184,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000442,
|
||
|
|
"loss": 6.6373,
|
||
|
|
"mean_token_accuracy": 0.10040950924158096,
|
||
|
|
"num_tokens": 1632098.0,
|
||
|
|
"step": 885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.826285457611084,
|
||
|
|
"epoch": 0.07477420709934887,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004445,
|
||
|
|
"loss": 6.6454,
|
||
|
|
"mean_token_accuracy": 0.09755287617444992,
|
||
|
|
"num_tokens": 1641259.0,
|
||
|
|
"step": 890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.0150947093963625,
|
||
|
|
"epoch": 0.07519428691451376,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.000447,
|
||
|
|
"loss": 6.7262,
|
||
|
|
"mean_token_accuracy": 0.09560549557209015,
|
||
|
|
"num_tokens": 1651362.0,
|
||
|
|
"step": 895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.897852563858033,
|
||
|
|
"epoch": 0.07561436672967864,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.00044950000000000003,
|
||
|
|
"loss": 6.6487,
|
||
|
|
"mean_token_accuracy": 0.10112505033612251,
|
||
|
|
"num_tokens": 1660190.0,
|
||
|
|
"step": 900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.90705189704895,
|
||
|
|
"epoch": 0.07603444654484352,
|
||
|
|
"grad_norm": 1.1953125,
|
||
|
|
"learning_rate": 0.00045200000000000004,
|
||
|
|
"loss": 6.663,
|
||
|
|
"mean_token_accuracy": 0.10142350941896439,
|
||
|
|
"num_tokens": 1669020.0,
|
||
|
|
"step": 905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.973592853546142,
|
||
|
|
"epoch": 0.0764545263600084,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00045450000000000004,
|
||
|
|
"loss": 6.6861,
|
||
|
|
"mean_token_accuracy": 0.1048488400876522,
|
||
|
|
"num_tokens": 1678158.0,
|
||
|
|
"step": 910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.985338020324707,
|
||
|
|
"epoch": 0.07687460617517328,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.00045700000000000005,
|
||
|
|
"loss": 6.7084,
|
||
|
|
"mean_token_accuracy": 0.10136276260018348,
|
||
|
|
"num_tokens": 1687481.0,
|
||
|
|
"step": 915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.876794004440308,
|
||
|
|
"epoch": 0.07729468599033816,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.00045950000000000006,
|
||
|
|
"loss": 6.6666,
|
||
|
|
"mean_token_accuracy": 0.10845559537410736,
|
||
|
|
"num_tokens": 1696782.0,
|
||
|
|
"step": 920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.932897567749023,
|
||
|
|
"epoch": 0.07771476580550304,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000462,
|
||
|
|
"loss": 6.6725,
|
||
|
|
"mean_token_accuracy": 0.10497085899114608,
|
||
|
|
"num_tokens": 1706153.0,
|
||
|
|
"step": 925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.9077776908874515,
|
||
|
|
"epoch": 0.07813484562066793,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004645,
|
||
|
|
"loss": 6.6889,
|
||
|
|
"mean_token_accuracy": 0.10281107649207115,
|
||
|
|
"num_tokens": 1715585.0,
|
||
|
|
"step": 930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.106683778762817,
|
||
|
|
"epoch": 0.07855492543583281,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.000467,
|
||
|
|
"loss": 6.8042,
|
||
|
|
"mean_token_accuracy": 0.10099845305085182,
|
||
|
|
"num_tokens": 1724857.0,
|
||
|
|
"step": 935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.858903789520264,
|
||
|
|
"epoch": 0.0789750052509977,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004695,
|
||
|
|
"loss": 6.6175,
|
||
|
|
"mean_token_accuracy": 0.10900806412100791,
|
||
|
|
"num_tokens": 1733528.0,
|
||
|
|
"step": 940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 7.006282758712769,
|
||
|
|
"epoch": 0.07939508506616257,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.000472,
|
||
|
|
"loss": 6.7383,
|
||
|
|
"mean_token_accuracy": 0.10379872918128967,
|
||
|
|
"num_tokens": 1742953.0,
|
||
|
|
"step": 945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.92790584564209,
|
||
|
|
"epoch": 0.07981516488132745,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004745,
|
||
|
|
"loss": 6.6988,
|
||
|
|
"mean_token_accuracy": 0.10636084228754043,
|
||
|
|
"num_tokens": 1752155.0,
|
||
|
|
"step": 950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.911950254440308,
|
||
|
|
"epoch": 0.08023524469649233,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.000477,
|
||
|
|
"loss": 6.5687,
|
||
|
|
"mean_token_accuracy": 0.10838210806250573,
|
||
|
|
"num_tokens": 1760562.0,
|
||
|
|
"step": 955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.83457088470459,
|
||
|
|
"epoch": 0.08065532451165722,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004795,
|
||
|
|
"loss": 6.5891,
|
||
|
|
"mean_token_accuracy": 0.10088410004973411,
|
||
|
|
"num_tokens": 1769631.0,
|
||
|
|
"step": 960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.914610385894775,
|
||
|
|
"epoch": 0.0810754043268221,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.000482,
|
||
|
|
"loss": 6.6346,
|
||
|
|
"mean_token_accuracy": 0.10217849463224411,
|
||
|
|
"num_tokens": 1779080.0,
|
||
|
|
"step": 965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8898755550384525,
|
||
|
|
"epoch": 0.08149548414198697,
|
||
|
|
"grad_norm": 1.296875,
|
||
|
|
"learning_rate": 0.0004845,
|
||
|
|
"loss": 6.6271,
|
||
|
|
"mean_token_accuracy": 0.10570115596055984,
|
||
|
|
"num_tokens": 1787830.0,
|
||
|
|
"step": 970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.751455068588257,
|
||
|
|
"epoch": 0.08191556395715185,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.000487,
|
||
|
|
"loss": 6.5346,
|
||
|
|
"mean_token_accuracy": 0.10223312452435493,
|
||
|
|
"num_tokens": 1796998.0,
|
||
|
|
"step": 975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8943780899047855,
|
||
|
|
"epoch": 0.08233564377231674,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004895,
|
||
|
|
"loss": 6.6202,
|
||
|
|
"mean_token_accuracy": 0.10597362667322159,
|
||
|
|
"num_tokens": 1806194.0,
|
||
|
|
"step": 980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.700069093704224,
|
||
|
|
"epoch": 0.08275572358748162,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.000492,
|
||
|
|
"loss": 6.5072,
|
||
|
|
"mean_token_accuracy": 0.10932167768478393,
|
||
|
|
"num_tokens": 1815751.0,
|
||
|
|
"step": 985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.749313592910767,
|
||
|
|
"epoch": 0.0831758034026465,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004945,
|
||
|
|
"loss": 6.5857,
|
||
|
|
"mean_token_accuracy": 0.10682184919714928,
|
||
|
|
"num_tokens": 1825379.0,
|
||
|
|
"step": 990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.845586490631104,
|
||
|
|
"epoch": 0.08359588321781139,
|
||
|
|
"grad_norm": 1.1328125,
|
||
|
|
"learning_rate": 0.000497,
|
||
|
|
"loss": 6.5541,
|
||
|
|
"mean_token_accuracy": 0.10507402196526527,
|
||
|
|
"num_tokens": 1834158.0,
|
||
|
|
"step": 995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.844553852081299,
|
||
|
|
"epoch": 0.08401596303297626,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004995,
|
||
|
|
"loss": 6.5161,
|
||
|
|
"mean_token_accuracy": 0.10857650190591812,
|
||
|
|
"num_tokens": 1842724.0,
|
||
|
|
"step": 1000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.795124101638794,
|
||
|
|
"epoch": 0.08443604284814114,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000499999998724557,
|
||
|
|
"loss": 6.5362,
|
||
|
|
"mean_token_accuracy": 0.10392995700240135,
|
||
|
|
"num_tokens": 1852485.0,
|
||
|
|
"step": 1005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.765092468261718,
|
||
|
|
"epoch": 0.08485612266330603,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999999935430703,
|
||
|
|
"loss": 6.575,
|
||
|
|
"mean_token_accuracy": 0.10723726153373718,
|
||
|
|
"num_tokens": 1861303.0,
|
||
|
|
"step": 1010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.745694637298584,
|
||
|
|
"epoch": 0.08527620247847091,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999999843758243,
|
||
|
|
"loss": 6.5409,
|
||
|
|
"mean_token_accuracy": 0.1151320680975914,
|
||
|
|
"num_tokens": 1870859.0,
|
||
|
|
"step": 1015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8996889114379885,
|
||
|
|
"epoch": 0.0856962822936358,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999999712228196,
|
||
|
|
"loss": 6.7032,
|
||
|
|
"mean_token_accuracy": 0.10041022300720215,
|
||
|
|
"num_tokens": 1880295.0,
|
||
|
|
"step": 1020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.899116802215576,
|
||
|
|
"epoch": 0.08611636210880068,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999999540840562,
|
||
|
|
"loss": 6.6176,
|
||
|
|
"mean_token_accuracy": 0.10147540494799615,
|
||
|
|
"num_tokens": 1889193.0,
|
||
|
|
"step": 1025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.797919845581054,
|
||
|
|
"epoch": 0.08653644192396555,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999999329595345,
|
||
|
|
"loss": 6.709,
|
||
|
|
"mean_token_accuracy": 0.09875654354691506,
|
||
|
|
"num_tokens": 1899437.0,
|
||
|
|
"step": 1030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.910034608840943,
|
||
|
|
"epoch": 0.08695652173913043,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999999078492548,
|
||
|
|
"loss": 6.6032,
|
||
|
|
"mean_token_accuracy": 0.10777303576469421,
|
||
|
|
"num_tokens": 1907882.0,
|
||
|
|
"step": 1035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.728742361068726,
|
||
|
|
"epoch": 0.08737660155429532,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004999998787532176,
|
||
|
|
"loss": 6.5131,
|
||
|
|
"mean_token_accuracy": 0.1080910786986351,
|
||
|
|
"num_tokens": 1916872.0,
|
||
|
|
"step": 1040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.86653618812561,
|
||
|
|
"epoch": 0.0877966813694602,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999998456714234,
|
||
|
|
"loss": 6.6681,
|
||
|
|
"mean_token_accuracy": 0.1074354499578476,
|
||
|
|
"num_tokens": 1926636.0,
|
||
|
|
"step": 1045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.773524904251099,
|
||
|
|
"epoch": 0.08821676118462508,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004999998086038729,
|
||
|
|
"loss": 6.5697,
|
||
|
|
"mean_token_accuracy": 0.108617003262043,
|
||
|
|
"num_tokens": 1935962.0,
|
||
|
|
"step": 1050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.809631824493408,
|
||
|
|
"epoch": 0.08863684099978995,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999997675505665,
|
||
|
|
"loss": 6.5493,
|
||
|
|
"mean_token_accuracy": 0.10353536382317544,
|
||
|
|
"num_tokens": 1944600.0,
|
||
|
|
"step": 1055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.8208941459655765,
|
||
|
|
"epoch": 0.08905692081495484,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999997225115052,
|
||
|
|
"loss": 6.7156,
|
||
|
|
"mean_token_accuracy": 0.10389059409499168,
|
||
|
|
"num_tokens": 1954234.0,
|
||
|
|
"step": 1060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.95792784690857,
|
||
|
|
"epoch": 0.08947700063011972,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999996734866896,
|
||
|
|
"loss": 6.677,
|
||
|
|
"mean_token_accuracy": 0.10057736709713935,
|
||
|
|
"num_tokens": 1964499.0,
|
||
|
|
"step": 1065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.662513589859008,
|
||
|
|
"epoch": 0.0898970804452846,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004999996204761206,
|
||
|
|
"loss": 6.3883,
|
||
|
|
"mean_token_accuracy": 0.11360553354024887,
|
||
|
|
"num_tokens": 1973635.0,
|
||
|
|
"step": 1070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.745052719116211,
|
||
|
|
"epoch": 0.09031716026044949,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004999995634797993,
|
||
|
|
"loss": 6.5278,
|
||
|
|
"mean_token_accuracy": 0.1087425634264946,
|
||
|
|
"num_tokens": 1983509.0,
|
||
|
|
"step": 1075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.769761800765991,
|
||
|
|
"epoch": 0.09073724007561437,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999995024977265,
|
||
|
|
"loss": 6.5385,
|
||
|
|
"mean_token_accuracy": 0.11216638460755349,
|
||
|
|
"num_tokens": 1992336.0,
|
||
|
|
"step": 1080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.855973386764527,
|
||
|
|
"epoch": 0.09115731989077924,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999994375299034,
|
||
|
|
"loss": 6.5509,
|
||
|
|
"mean_token_accuracy": 0.1137130968272686,
|
||
|
|
"num_tokens": 2001931.0,
|
||
|
|
"step": 1085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.615939617156982,
|
||
|
|
"epoch": 0.09157739970594413,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.000499999368576331,
|
||
|
|
"loss": 6.4174,
|
||
|
|
"mean_token_accuracy": 0.11283476129174233,
|
||
|
|
"num_tokens": 2010935.0,
|
||
|
|
"step": 1090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7152961730957035,
|
||
|
|
"epoch": 0.09199747952110901,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999992956370109,
|
||
|
|
"loss": 6.4684,
|
||
|
|
"mean_token_accuracy": 0.11342488676309585,
|
||
|
|
"num_tokens": 2020587.0,
|
||
|
|
"step": 1095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.688837385177612,
|
||
|
|
"epoch": 0.0924175593362739,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000499999218711944,
|
||
|
|
"loss": 6.5046,
|
||
|
|
"mean_token_accuracy": 0.10743609666824341,
|
||
|
|
"num_tokens": 2029743.0,
|
||
|
|
"step": 1100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.771305274963379,
|
||
|
|
"epoch": 0.09283763915143878,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999991378011317,
|
||
|
|
"loss": 6.5286,
|
||
|
|
"mean_token_accuracy": 0.11453117504715919,
|
||
|
|
"num_tokens": 2038468.0,
|
||
|
|
"step": 1105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.67022180557251,
|
||
|
|
"epoch": 0.09325771896660366,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999990529045757,
|
||
|
|
"loss": 6.4451,
|
||
|
|
"mean_token_accuracy": 0.11554965823888778,
|
||
|
|
"num_tokens": 2047456.0,
|
||
|
|
"step": 1110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.870058679580689,
|
||
|
|
"epoch": 0.09367779878176853,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004999989640222771,
|
||
|
|
"loss": 6.7458,
|
||
|
|
"mean_token_accuracy": 0.09942527562379837,
|
||
|
|
"num_tokens": 2056691.0,
|
||
|
|
"step": 1115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.829685544967651,
|
||
|
|
"epoch": 0.09409787859693342,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499998871154238,
|
||
|
|
"loss": 6.5487,
|
||
|
|
"mean_token_accuracy": 0.10888865366578102,
|
||
|
|
"num_tokens": 2066068.0,
|
||
|
|
"step": 1120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.725253868103027,
|
||
|
|
"epoch": 0.0945179584120983,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999987743004597,
|
||
|
|
"loss": 6.4837,
|
||
|
|
"mean_token_accuracy": 0.11379996240139008,
|
||
|
|
"num_tokens": 2075113.0,
|
||
|
|
"step": 1125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7777934074401855,
|
||
|
|
"epoch": 0.09493803822726318,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004999986734609438,
|
||
|
|
"loss": 6.6044,
|
||
|
|
"mean_token_accuracy": 0.11070828661322593,
|
||
|
|
"num_tokens": 2084557.0,
|
||
|
|
"step": 1130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.817347526550293,
|
||
|
|
"epoch": 0.09535811804242807,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999985686356923,
|
||
|
|
"loss": 6.497,
|
||
|
|
"mean_token_accuracy": 0.10584703534841537,
|
||
|
|
"num_tokens": 2093424.0,
|
||
|
|
"step": 1135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.7462608337402346,
|
||
|
|
"epoch": 0.09577819785759294,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499998459824707,
|
||
|
|
"loss": 6.6329,
|
||
|
|
"mean_token_accuracy": 0.10303654298186302,
|
||
|
|
"num_tokens": 2103066.0,
|
||
|
|
"step": 1140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.799277830123901,
|
||
|
|
"epoch": 0.09619827767275782,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00049999834702799,
|
||
|
|
"loss": 6.5085,
|
||
|
|
"mean_token_accuracy": 0.11131441742181777,
|
||
|
|
"num_tokens": 2112447.0,
|
||
|
|
"step": 1145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.711055421829224,
|
||
|
|
"epoch": 0.0966183574879227,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004999982302455431,
|
||
|
|
"loss": 6.52,
|
||
|
|
"mean_token_accuracy": 0.11281892731785774,
|
||
|
|
"num_tokens": 2121949.0,
|
||
|
|
"step": 1150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.780323314666748,
|
||
|
|
"epoch": 0.09703843730308759,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999981094773683,
|
||
|
|
"loss": 6.4157,
|
||
|
|
"mean_token_accuracy": 0.1144998162984848,
|
||
|
|
"num_tokens": 2130464.0,
|
||
|
|
"step": 1155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.697625207901001,
|
||
|
|
"epoch": 0.09745851711825247,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.000499997984723468,
|
||
|
|
"loss": 6.5921,
|
||
|
|
"mean_token_accuracy": 0.1068018026649952,
|
||
|
|
"num_tokens": 2139577.0,
|
||
|
|
"step": 1160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.569090557098389,
|
||
|
|
"epoch": 0.09787859693341736,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004999978559838441,
|
||
|
|
"loss": 6.3121,
|
||
|
|
"mean_token_accuracy": 0.11300956755876541,
|
||
|
|
"num_tokens": 2147919.0,
|
||
|
|
"step": 1165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.716167974472046,
|
||
|
|
"epoch": 0.09829867674858223,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999977232584991,
|
||
|
|
"loss": 6.4791,
|
||
|
|
"mean_token_accuracy": 0.11262017637491226,
|
||
|
|
"num_tokens": 2156936.0,
|
||
|
|
"step": 1170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6336616516113285,
|
||
|
|
"epoch": 0.09871875656374711,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999975865474354,
|
||
|
|
"loss": 6.5492,
|
||
|
|
"mean_token_accuracy": 0.10994603037834168,
|
||
|
|
"num_tokens": 2165362.0,
|
||
|
|
"step": 1175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.719806575775147,
|
||
|
|
"epoch": 0.099138836378912,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.0004999974458506551,
|
||
|
|
"loss": 6.4705,
|
||
|
|
"mean_token_accuracy": 0.11214353889226913,
|
||
|
|
"num_tokens": 2173665.0,
|
||
|
|
"step": 1180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.786266422271728,
|
||
|
|
"epoch": 0.09955891619407688,
|
||
|
|
"grad_norm": 1.2578125,
|
||
|
|
"learning_rate": 0.000499997301168161,
|
||
|
|
"loss": 6.4531,
|
||
|
|
"mean_token_accuracy": 0.11377902403473854,
|
||
|
|
"num_tokens": 2182222.0,
|
||
|
|
"step": 1185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.670177459716797,
|
||
|
|
"epoch": 0.09997899600924176,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004999971524999556,
|
||
|
|
"loss": 6.528,
|
||
|
|
"mean_token_accuracy": 0.11228533461689949,
|
||
|
|
"num_tokens": 2192358.0,
|
||
|
|
"step": 1190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.779563045501709,
|
||
|
|
"epoch": 0.10039907582440663,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999969998460414,
|
||
|
|
"loss": 6.5039,
|
||
|
|
"mean_token_accuracy": 0.10956505164504052,
|
||
|
|
"num_tokens": 2201889.0,
|
||
|
|
"step": 1195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6560157299041744,
|
||
|
|
"epoch": 0.10081915563957151,
|
||
|
|
"grad_norm": 1.3359375,
|
||
|
|
"learning_rate": 0.0004999968432064213,
|
||
|
|
"loss": 6.5232,
|
||
|
|
"mean_token_accuracy": 0.11500915959477424,
|
||
|
|
"num_tokens": 2211810.0,
|
||
|
|
"step": 1200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.652071762084961,
|
||
|
|
"epoch": 0.1012392354547364,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004999966825810979,
|
||
|
|
"loss": 6.4474,
|
||
|
|
"mean_token_accuracy": 0.11259665861725807,
|
||
|
|
"num_tokens": 2221123.0,
|
||
|
|
"step": 1205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.634405040740967,
|
||
|
|
"epoch": 0.10165931526990128,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999965179700742,
|
||
|
|
"loss": 6.402,
|
||
|
|
"mean_token_accuracy": 0.1181789293885231,
|
||
|
|
"num_tokens": 2230129.0,
|
||
|
|
"step": 1210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.625933122634888,
|
||
|
|
"epoch": 0.10207939508506617,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499996349373353,
|
||
|
|
"loss": 6.4624,
|
||
|
|
"mean_token_accuracy": 0.11246607527136802,
|
||
|
|
"num_tokens": 2239929.0,
|
||
|
|
"step": 1215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.709180927276611,
|
||
|
|
"epoch": 0.10249947490023105,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999961767909374,
|
||
|
|
"loss": 6.4292,
|
||
|
|
"mean_token_accuracy": 0.11479318514466286,
|
||
|
|
"num_tokens": 2248078.0,
|
||
|
|
"step": 1220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.59263162612915,
|
||
|
|
"epoch": 0.10291955471539592,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999960002228303,
|
||
|
|
"loss": 6.5262,
|
||
|
|
"mean_token_accuracy": 0.11000767946243287,
|
||
|
|
"num_tokens": 2256975.0,
|
||
|
|
"step": 1225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.708470964431763,
|
||
|
|
"epoch": 0.1033396345305608,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004999958196690349,
|
||
|
|
"loss": 6.3792,
|
||
|
|
"mean_token_accuracy": 0.11624118462204933,
|
||
|
|
"num_tokens": 2265797.0,
|
||
|
|
"step": 1230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.645881128311157,
|
||
|
|
"epoch": 0.10375971434572569,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999956351295545,
|
||
|
|
"loss": 6.4736,
|
||
|
|
"mean_token_accuracy": 0.1176276110112667,
|
||
|
|
"num_tokens": 2274099.0,
|
||
|
|
"step": 1235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.599815797805786,
|
||
|
|
"epoch": 0.10417979416089057,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999954466043922,
|
||
|
|
"loss": 6.3853,
|
||
|
|
"mean_token_accuracy": 0.11810432821512222,
|
||
|
|
"num_tokens": 2282360.0,
|
||
|
|
"step": 1240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.57668776512146,
|
||
|
|
"epoch": 0.10459987397605545,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999952540935514,
|
||
|
|
"loss": 6.4891,
|
||
|
|
"mean_token_accuracy": 0.11048517748713493,
|
||
|
|
"num_tokens": 2292714.0,
|
||
|
|
"step": 1245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.675060033798218,
|
||
|
|
"epoch": 0.10501995379122034,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999950575970356,
|
||
|
|
"loss": 6.4361,
|
||
|
|
"mean_token_accuracy": 0.11576245203614235,
|
||
|
|
"num_tokens": 2301633.0,
|
||
|
|
"step": 1250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.642887592315674,
|
||
|
|
"epoch": 0.10544003360638521,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999948571148482,
|
||
|
|
"loss": 6.3931,
|
||
|
|
"mean_token_accuracy": 0.12049147412180901,
|
||
|
|
"num_tokens": 2310067.0,
|
||
|
|
"step": 1255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.610925579071045,
|
||
|
|
"epoch": 0.10586011342155009,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999946526469927,
|
||
|
|
"loss": 6.4927,
|
||
|
|
"mean_token_accuracy": 0.11412879601120948,
|
||
|
|
"num_tokens": 2320090.0,
|
||
|
|
"step": 1260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.649963521957398,
|
||
|
|
"epoch": 0.10628019323671498,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999944441934728,
|
||
|
|
"loss": 6.4451,
|
||
|
|
"mean_token_accuracy": 0.11852803751826287,
|
||
|
|
"num_tokens": 2329255.0,
|
||
|
|
"step": 1265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.678138732910156,
|
||
|
|
"epoch": 0.10670027305187986,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999942317542922,
|
||
|
|
"loss": 6.5261,
|
||
|
|
"mean_token_accuracy": 0.11407028958201408,
|
||
|
|
"num_tokens": 2339535.0,
|
||
|
|
"step": 1270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.635104560852051,
|
||
|
|
"epoch": 0.10712035286704474,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999940153294546,
|
||
|
|
"loss": 6.425,
|
||
|
|
"mean_token_accuracy": 0.11798783987760544,
|
||
|
|
"num_tokens": 2348948.0,
|
||
|
|
"step": 1275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.629437446594238,
|
||
|
|
"epoch": 0.10754043268220961,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000499993794918964,
|
||
|
|
"loss": 6.4518,
|
||
|
|
"mean_token_accuracy": 0.10851866900920867,
|
||
|
|
"num_tokens": 2359141.0,
|
||
|
|
"step": 1280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.612447357177734,
|
||
|
|
"epoch": 0.1079605124973745,
|
||
|
|
"grad_norm": 1.1875,
|
||
|
|
"learning_rate": 0.0004999935705228241,
|
||
|
|
"loss": 6.5007,
|
||
|
|
"mean_token_accuracy": 0.10988411605358124,
|
||
|
|
"num_tokens": 2368906.0,
|
||
|
|
"step": 1285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.720192527770996,
|
||
|
|
"epoch": 0.10838059231253938,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004999933421410389,
|
||
|
|
"loss": 6.4756,
|
||
|
|
"mean_token_accuracy": 0.11632761880755424,
|
||
|
|
"num_tokens": 2377029.0,
|
||
|
|
"step": 1290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.682251882553101,
|
||
|
|
"epoch": 0.10880067212770426,
|
||
|
|
"grad_norm": 0.84765625,
|
||
|
|
"learning_rate": 0.0004999931097736125,
|
||
|
|
"loss": 6.5226,
|
||
|
|
"mean_token_accuracy": 0.10841714516282082,
|
||
|
|
"num_tokens": 2387088.0,
|
||
|
|
"step": 1295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.616416501998901,
|
||
|
|
"epoch": 0.10922075194286915,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999928734205492,
|
||
|
|
"loss": 6.4358,
|
||
|
|
"mean_token_accuracy": 0.11085559725761414,
|
||
|
|
"num_tokens": 2395596.0,
|
||
|
|
"step": 1300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.630216932296753,
|
||
|
|
"epoch": 0.10964083175803403,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999926330818528,
|
||
|
|
"loss": 6.4278,
|
||
|
|
"mean_token_accuracy": 0.11868382543325424,
|
||
|
|
"num_tokens": 2404506.0,
|
||
|
|
"step": 1305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.615355587005615,
|
||
|
|
"epoch": 0.1100609115731989,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004999923887575278,
|
||
|
|
"loss": 6.4742,
|
||
|
|
"mean_token_accuracy": 0.11464583277702331,
|
||
|
|
"num_tokens": 2414342.0,
|
||
|
|
"step": 1310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.68165545463562,
|
||
|
|
"epoch": 0.11048099138836379,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999921404475785,
|
||
|
|
"loss": 6.4271,
|
||
|
|
"mean_token_accuracy": 0.11960532069206238,
|
||
|
|
"num_tokens": 2423076.0,
|
||
|
|
"step": 1315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.567938899993896,
|
||
|
|
"epoch": 0.11090107120352867,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004999918881520093,
|
||
|
|
"loss": 6.3809,
|
||
|
|
"mean_token_accuracy": 0.1204459622502327,
|
||
|
|
"num_tokens": 2432492.0,
|
||
|
|
"step": 1320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.610611057281494,
|
||
|
|
"epoch": 0.11132115101869355,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999916318708246,
|
||
|
|
"loss": 6.3447,
|
||
|
|
"mean_token_accuracy": 0.1213211365044117,
|
||
|
|
"num_tokens": 2441916.0,
|
||
|
|
"step": 1325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.550094270706177,
|
||
|
|
"epoch": 0.11174123083385844,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004999913716040291,
|
||
|
|
"loss": 6.4,
|
||
|
|
"mean_token_accuracy": 0.11803905665874481,
|
||
|
|
"num_tokens": 2450932.0,
|
||
|
|
"step": 1330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5825268745422365,
|
||
|
|
"epoch": 0.11216131064902331,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999911073516272,
|
||
|
|
"loss": 6.4156,
|
||
|
|
"mean_token_accuracy": 0.11501810997724533,
|
||
|
|
"num_tokens": 2460058.0,
|
||
|
|
"step": 1335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.541036558151245,
|
||
|
|
"epoch": 0.11258139046418819,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999908391136237,
|
||
|
|
"loss": 6.3486,
|
||
|
|
"mean_token_accuracy": 0.11862518936395645,
|
||
|
|
"num_tokens": 2469607.0,
|
||
|
|
"step": 1340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.54659481048584,
|
||
|
|
"epoch": 0.11300147027935308,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999905668900234,
|
||
|
|
"loss": 6.4037,
|
||
|
|
"mean_token_accuracy": 0.11429757624864578,
|
||
|
|
"num_tokens": 2478345.0,
|
||
|
|
"step": 1345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.665723133087158,
|
||
|
|
"epoch": 0.11342155009451796,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.000499990290680831,
|
||
|
|
"loss": 6.3362,
|
||
|
|
"mean_token_accuracy": 0.11939993128180504,
|
||
|
|
"num_tokens": 2486662.0,
|
||
|
|
"step": 1350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.539735174179077,
|
||
|
|
"epoch": 0.11384162990968284,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999900104860516,
|
||
|
|
"loss": 6.4496,
|
||
|
|
"mean_token_accuracy": 0.11450904607772827,
|
||
|
|
"num_tokens": 2495392.0,
|
||
|
|
"step": 1355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.640576314926148,
|
||
|
|
"epoch": 0.11426170972484773,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999897263056898,
|
||
|
|
"loss": 6.4824,
|
||
|
|
"mean_token_accuracy": 0.11427311152219773,
|
||
|
|
"num_tokens": 2505254.0,
|
||
|
|
"step": 1360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6059410572052,
|
||
|
|
"epoch": 0.1146817895400126,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000499989438139751,
|
||
|
|
"loss": 6.2902,
|
||
|
|
"mean_token_accuracy": 0.12163057401776314,
|
||
|
|
"num_tokens": 2514096.0,
|
||
|
|
"step": 1365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.572102785110474,
|
||
|
|
"epoch": 0.11510186935517748,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004999891459882401,
|
||
|
|
"loss": 6.3036,
|
||
|
|
"mean_token_accuracy": 0.12106614261865616,
|
||
|
|
"num_tokens": 2523635.0,
|
||
|
|
"step": 1370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.518535518646241,
|
||
|
|
"epoch": 0.11552194917034236,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999888498511624,
|
||
|
|
"loss": 6.3872,
|
||
|
|
"mean_token_accuracy": 0.117999816685915,
|
||
|
|
"num_tokens": 2532528.0,
|
||
|
|
"step": 1375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.522701168060303,
|
||
|
|
"epoch": 0.11594202898550725,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999885497285229,
|
||
|
|
"loss": 6.3026,
|
||
|
|
"mean_token_accuracy": 0.11809839084744453,
|
||
|
|
"num_tokens": 2541893.0,
|
||
|
|
"step": 1380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.516852188110351,
|
||
|
|
"epoch": 0.11636210880067213,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999882456203273,
|
||
|
|
"loss": 6.3627,
|
||
|
|
"mean_token_accuracy": 0.11867272853851318,
|
||
|
|
"num_tokens": 2551551.0,
|
||
|
|
"step": 1385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.592957019805908,
|
||
|
|
"epoch": 0.11678218861583702,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999879375265806,
|
||
|
|
"loss": 6.314,
|
||
|
|
"mean_token_accuracy": 0.1192450650036335,
|
||
|
|
"num_tokens": 2560183.0,
|
||
|
|
"step": 1390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.526823472976685,
|
||
|
|
"epoch": 0.11720226843100189,
|
||
|
|
"grad_norm": 1.1484375,
|
||
|
|
"learning_rate": 0.0004999876254472886,
|
||
|
|
"loss": 6.2065,
|
||
|
|
"mean_token_accuracy": 0.127345572412014,
|
||
|
|
"num_tokens": 2568697.0,
|
||
|
|
"step": 1395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.488171815872192,
|
||
|
|
"epoch": 0.11762234824616677,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004999873093824565,
|
||
|
|
"loss": 6.4136,
|
||
|
|
"mean_token_accuracy": 0.1172497920691967,
|
||
|
|
"num_tokens": 2578151.0,
|
||
|
|
"step": 1400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.697162342071533,
|
||
|
|
"epoch": 0.11804242806133165,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004999869893320902,
|
||
|
|
"loss": 6.5415,
|
||
|
|
"mean_token_accuracy": 0.11695929765701293,
|
||
|
|
"num_tokens": 2585901.0,
|
||
|
|
"step": 1405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.558137512207031,
|
||
|
|
"epoch": 0.11846250787649654,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999866652961952,
|
||
|
|
"loss": 6.3565,
|
||
|
|
"mean_token_accuracy": 0.11195311546325684,
|
||
|
|
"num_tokens": 2595655.0,
|
||
|
|
"step": 1410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.547592639923096,
|
||
|
|
"epoch": 0.11888258769166142,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004999863372747773,
|
||
|
|
"loss": 6.3241,
|
||
|
|
"mean_token_accuracy": 0.1137452982366085,
|
||
|
|
"num_tokens": 2604949.0,
|
||
|
|
"step": 1415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.549184036254883,
|
||
|
|
"epoch": 0.11930266750682629,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999860052678423,
|
||
|
|
"loss": 6.3987,
|
||
|
|
"mean_token_accuracy": 0.12182095795869827,
|
||
|
|
"num_tokens": 2614260.0,
|
||
|
|
"step": 1420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.533220100402832,
|
||
|
|
"epoch": 0.11972274732199117,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999856692753959,
|
||
|
|
"loss": 6.3846,
|
||
|
|
"mean_token_accuracy": 0.11606933474540711,
|
||
|
|
"num_tokens": 2623740.0,
|
||
|
|
"step": 1425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.56026554107666,
|
||
|
|
"epoch": 0.12014282713715606,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999853292974444,
|
||
|
|
"loss": 6.2829,
|
||
|
|
"mean_token_accuracy": 0.1191012591123581,
|
||
|
|
"num_tokens": 2631998.0,
|
||
|
|
"step": 1430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.436700010299683,
|
||
|
|
"epoch": 0.12056290695232094,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999849853339936,
|
||
|
|
"loss": 6.4441,
|
||
|
|
"mean_token_accuracy": 0.12089451104402542,
|
||
|
|
"num_tokens": 2641169.0,
|
||
|
|
"step": 1435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.6503981590271,
|
||
|
|
"epoch": 0.12098298676748583,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004999846373850497,
|
||
|
|
"loss": 6.2726,
|
||
|
|
"mean_token_accuracy": 0.12328374907374381,
|
||
|
|
"num_tokens": 2650576.0,
|
||
|
|
"step": 1440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.504758834838867,
|
||
|
|
"epoch": 0.12140306658265071,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999842854506186,
|
||
|
|
"loss": 6.3597,
|
||
|
|
"mean_token_accuracy": 0.11508475914597512,
|
||
|
|
"num_tokens": 2660817.0,
|
||
|
|
"step": 1445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.454709720611572,
|
||
|
|
"epoch": 0.12182314639781558,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999839295307069,
|
||
|
|
"loss": 6.317,
|
||
|
|
"mean_token_accuracy": 0.11818674132227898,
|
||
|
|
"num_tokens": 2669338.0,
|
||
|
|
"step": 1450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5724732875823975,
|
||
|
|
"epoch": 0.12224322621298046,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999835696253206,
|
||
|
|
"loss": 6.3698,
|
||
|
|
"mean_token_accuracy": 0.11763316094875335,
|
||
|
|
"num_tokens": 2679108.0,
|
||
|
|
"step": 1455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.542471504211425,
|
||
|
|
"epoch": 0.12266330602814535,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004999832057344664,
|
||
|
|
"loss": 6.3312,
|
||
|
|
"mean_token_accuracy": 0.11857884675264359,
|
||
|
|
"num_tokens": 2688126.0,
|
||
|
|
"step": 1460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3690132141113285,
|
||
|
|
"epoch": 0.12308338584331023,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999828378581504,
|
||
|
|
"loss": 6.2827,
|
||
|
|
"mean_token_accuracy": 0.12631092369556426,
|
||
|
|
"num_tokens": 2697245.0,
|
||
|
|
"step": 1465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5668089389801025,
|
||
|
|
"epoch": 0.12350346565847511,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999824659963793,
|
||
|
|
"loss": 6.3543,
|
||
|
|
"mean_token_accuracy": 0.12048940360546112,
|
||
|
|
"num_tokens": 2705934.0,
|
||
|
|
"step": 1470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.516648006439209,
|
||
|
|
"epoch": 0.12392354547364,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999820901491598,
|
||
|
|
"loss": 6.2753,
|
||
|
|
"mean_token_accuracy": 0.12523386031389236,
|
||
|
|
"num_tokens": 2714367.0,
|
||
|
|
"step": 1475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.416815328598022,
|
||
|
|
"epoch": 0.12434362528880487,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999817103164983,
|
||
|
|
"loss": 6.3117,
|
||
|
|
"mean_token_accuracy": 0.12113343179225922,
|
||
|
|
"num_tokens": 2724366.0,
|
||
|
|
"step": 1480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.518594264984131,
|
||
|
|
"epoch": 0.12476370510396975,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004999813264984017,
|
||
|
|
"loss": 6.3262,
|
||
|
|
"mean_token_accuracy": 0.11913523152470588,
|
||
|
|
"num_tokens": 2733980.0,
|
||
|
|
"step": 1485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.520108652114868,
|
||
|
|
"epoch": 0.12518378491913462,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999809386948767,
|
||
|
|
"loss": 6.3232,
|
||
|
|
"mean_token_accuracy": 0.11875561475753785,
|
||
|
|
"num_tokens": 2744013.0,
|
||
|
|
"step": 1490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4508843421936035,
|
||
|
|
"epoch": 0.12560386473429952,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004999805469059302,
|
||
|
|
"loss": 6.3917,
|
||
|
|
"mean_token_accuracy": 0.1202739343047142,
|
||
|
|
"num_tokens": 2753385.0,
|
||
|
|
"step": 1495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.467165565490722,
|
||
|
|
"epoch": 0.1260239445494644,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999801511315693,
|
||
|
|
"loss": 6.2443,
|
||
|
|
"mean_token_accuracy": 0.11950960382819176,
|
||
|
|
"num_tokens": 2762875.0,
|
||
|
|
"step": 1500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.561000490188599,
|
||
|
|
"epoch": 0.1264440243646293,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999797513718007,
|
||
|
|
"loss": 6.3133,
|
||
|
|
"mean_token_accuracy": 0.12554540634155273,
|
||
|
|
"num_tokens": 2772182.0,
|
||
|
|
"step": 1505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.398244476318359,
|
||
|
|
"epoch": 0.12686410417979416,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999793476266317,
|
||
|
|
"loss": 6.2652,
|
||
|
|
"mean_token_accuracy": 0.12494927272200584,
|
||
|
|
"num_tokens": 2780814.0,
|
||
|
|
"step": 1510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.759689664840698,
|
||
|
|
"epoch": 0.12728418399495905,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999789398960695,
|
||
|
|
"loss": 6.5371,
|
||
|
|
"mean_token_accuracy": 0.120218076556921,
|
||
|
|
"num_tokens": 2791104.0,
|
||
|
|
"step": 1515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.380699729919433,
|
||
|
|
"epoch": 0.12770426381012392,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999785281801212,
|
||
|
|
"loss": 6.2392,
|
||
|
|
"mean_token_accuracy": 0.12141881808638573,
|
||
|
|
"num_tokens": 2800081.0,
|
||
|
|
"step": 1520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.502162122726441,
|
||
|
|
"epoch": 0.1281243436252888,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000499978112478794,
|
||
|
|
"loss": 6.3645,
|
||
|
|
"mean_token_accuracy": 0.11820052862167359,
|
||
|
|
"num_tokens": 2809096.0,
|
||
|
|
"step": 1525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.559705686569214,
|
||
|
|
"epoch": 0.1285444234404537,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004999776927920955,
|
||
|
|
"loss": 6.3324,
|
||
|
|
"mean_token_accuracy": 0.12376131415367127,
|
||
|
|
"num_tokens": 2818857.0,
|
||
|
|
"step": 1530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.478033876419067,
|
||
|
|
"epoch": 0.12896450325561856,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000499977269120033,
|
||
|
|
"loss": 6.3924,
|
||
|
|
"mean_token_accuracy": 0.11640017554163933,
|
||
|
|
"num_tokens": 2829332.0,
|
||
|
|
"step": 1535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.471277475357056,
|
||
|
|
"epoch": 0.12938458307078346,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000499976841462614,
|
||
|
|
"loss": 6.3118,
|
||
|
|
"mean_token_accuracy": 0.11578154116868973,
|
||
|
|
"num_tokens": 2839193.0,
|
||
|
|
"step": 1540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.515983152389526,
|
||
|
|
"epoch": 0.12980466288594833,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.000499976409819846,
|
||
|
|
"loss": 6.3126,
|
||
|
|
"mean_token_accuracy": 0.1165178470313549,
|
||
|
|
"num_tokens": 2848535.0,
|
||
|
|
"step": 1545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.329218864440918,
|
||
|
|
"epoch": 0.1302247427011132,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004999759741917369,
|
||
|
|
"loss": 6.2119,
|
||
|
|
"mean_token_accuracy": 0.12768493369221687,
|
||
|
|
"num_tokens": 2858090.0,
|
||
|
|
"step": 1550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4847986698150635,
|
||
|
|
"epoch": 0.1306448225162781,
|
||
|
|
"grad_norm": 1.1640625,
|
||
|
|
"learning_rate": 0.0004999755345782941,
|
||
|
|
"loss": 6.3672,
|
||
|
|
"mean_token_accuracy": 0.1186487466096878,
|
||
|
|
"num_tokens": 2866984.0,
|
||
|
|
"step": 1555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.419411611557007,
|
||
|
|
"epoch": 0.13106490233144297,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004999750909795256,
|
||
|
|
"loss": 6.1757,
|
||
|
|
"mean_token_accuracy": 0.1280258044600487,
|
||
|
|
"num_tokens": 2876550.0,
|
||
|
|
"step": 1560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.461032104492188,
|
||
|
|
"epoch": 0.13148498214660786,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999746433954394,
|
||
|
|
"loss": 6.2774,
|
||
|
|
"mean_token_accuracy": 0.1213872842490673,
|
||
|
|
"num_tokens": 2885782.0,
|
||
|
|
"step": 1565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.447916793823242,
|
||
|
|
"epoch": 0.13190506196177273,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000499974191826043,
|
||
|
|
"loss": 6.2448,
|
||
|
|
"mean_token_accuracy": 0.13687582612037658,
|
||
|
|
"num_tokens": 2894807.0,
|
||
|
|
"step": 1570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.439778518676758,
|
||
|
|
"epoch": 0.1323251417769376,
|
||
|
|
"grad_norm": 1.171875,
|
||
|
|
"learning_rate": 0.0004999737362713448,
|
||
|
|
"loss": 6.2925,
|
||
|
|
"mean_token_accuracy": 0.1238982230424881,
|
||
|
|
"num_tokens": 2904076.0,
|
||
|
|
"step": 1575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.471430492401123,
|
||
|
|
"epoch": 0.1327452215921025,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999732767313527,
|
||
|
|
"loss": 6.2033,
|
||
|
|
"mean_token_accuracy": 0.1205870471894741,
|
||
|
|
"num_tokens": 2913761.0,
|
||
|
|
"step": 1580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.509069633483887,
|
||
|
|
"epoch": 0.13316530140726737,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004999728132060746,
|
||
|
|
"loss": 6.4228,
|
||
|
|
"mean_token_accuracy": 0.12286271527409554,
|
||
|
|
"num_tokens": 2922848.0,
|
||
|
|
"step": 1585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.5165454864501955,
|
||
|
|
"epoch": 0.13358538122243227,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004999723456955192,
|
||
|
|
"loss": 6.3079,
|
||
|
|
"mean_token_accuracy": 0.11906806230545045,
|
||
|
|
"num_tokens": 2932718.0,
|
||
|
|
"step": 1590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.353040504455566,
|
||
|
|
"epoch": 0.13400546103759714,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004999718741996945,
|
||
|
|
"loss": 6.2648,
|
||
|
|
"mean_token_accuracy": 0.12362491562962533,
|
||
|
|
"num_tokens": 2942686.0,
|
||
|
|
"step": 1595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.480581188201905,
|
||
|
|
"epoch": 0.13442554085276204,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.000499971398718609,
|
||
|
|
"loss": 6.2304,
|
||
|
|
"mean_token_accuracy": 0.12233746945858001,
|
||
|
|
"num_tokens": 2952096.0,
|
||
|
|
"step": 1600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.41249566078186,
|
||
|
|
"epoch": 0.1348456206679269,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999709192522708,
|
||
|
|
"loss": 6.3139,
|
||
|
|
"mean_token_accuracy": 0.12512291446328164,
|
||
|
|
"num_tokens": 2960660.0,
|
||
|
|
"step": 1605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.536613845825196,
|
||
|
|
"epoch": 0.13526570048309178,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004999704358006887,
|
||
|
|
"loss": 6.3118,
|
||
|
|
"mean_token_accuracy": 0.12129077091813087,
|
||
|
|
"num_tokens": 2969834.0,
|
||
|
|
"step": 1610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4085368633270265,
|
||
|
|
"epoch": 0.13568578029825668,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004999699483638712,
|
||
|
|
"loss": 6.2906,
|
||
|
|
"mean_token_accuracy": 0.12232841104269028,
|
||
|
|
"num_tokens": 2979023.0,
|
||
|
|
"step": 1615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.476312971115112,
|
||
|
|
"epoch": 0.13610586011342155,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999694569418269,
|
||
|
|
"loss": 6.2964,
|
||
|
|
"mean_token_accuracy": 0.12233099341392517,
|
||
|
|
"num_tokens": 2988083.0,
|
||
|
|
"step": 1620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.359239149093628,
|
||
|
|
"epoch": 0.13652593992858644,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999689615345645,
|
||
|
|
"loss": 6.2196,
|
||
|
|
"mean_token_accuracy": 0.12490532472729683,
|
||
|
|
"num_tokens": 2997240.0,
|
||
|
|
"step": 1625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.505274820327759,
|
||
|
|
"epoch": 0.1369460197437513,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999684621420928,
|
||
|
|
"loss": 6.2805,
|
||
|
|
"mean_token_accuracy": 0.12174654453992843,
|
||
|
|
"num_tokens": 3007077.0,
|
||
|
|
"step": 1630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.501539659500122,
|
||
|
|
"epoch": 0.13736609955891618,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999679587644205,
|
||
|
|
"loss": 6.3282,
|
||
|
|
"mean_token_accuracy": 0.11869422942399979,
|
||
|
|
"num_tokens": 3015821.0,
|
||
|
|
"step": 1635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.434766483306885,
|
||
|
|
"epoch": 0.13778617937408108,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999674514015568,
|
||
|
|
"loss": 6.2508,
|
||
|
|
"mean_token_accuracy": 0.1246812529861927,
|
||
|
|
"num_tokens": 3025858.0,
|
||
|
|
"step": 1640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.406217813491821,
|
||
|
|
"epoch": 0.13820625918924595,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999669400535105,
|
||
|
|
"loss": 6.2132,
|
||
|
|
"mean_token_accuracy": 0.12023203670978547,
|
||
|
|
"num_tokens": 3035537.0,
|
||
|
|
"step": 1645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.359542560577393,
|
||
|
|
"epoch": 0.13862633900441085,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999664247202907,
|
||
|
|
"loss": 6.152,
|
||
|
|
"mean_token_accuracy": 0.12406394928693772,
|
||
|
|
"num_tokens": 3044204.0,
|
||
|
|
"step": 1650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.404636430740356,
|
||
|
|
"epoch": 0.13904641881957572,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999659054019066,
|
||
|
|
"loss": 6.2994,
|
||
|
|
"mean_token_accuracy": 0.12448503151535988,
|
||
|
|
"num_tokens": 3053111.0,
|
||
|
|
"step": 1655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.443476963043213,
|
||
|
|
"epoch": 0.1394664986347406,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999653820983673,
|
||
|
|
"loss": 6.2201,
|
||
|
|
"mean_token_accuracy": 0.12843194082379342,
|
||
|
|
"num_tokens": 3062456.0,
|
||
|
|
"step": 1660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.356498098373413,
|
||
|
|
"epoch": 0.13988657844990549,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.000499964854809682,
|
||
|
|
"loss": 6.2579,
|
||
|
|
"mean_token_accuracy": 0.12453076243400574,
|
||
|
|
"num_tokens": 3071132.0,
|
||
|
|
"step": 1665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.388091611862182,
|
||
|
|
"epoch": 0.14030665826507036,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999643235358602,
|
||
|
|
"loss": 6.2078,
|
||
|
|
"mean_token_accuracy": 0.12833356559276582,
|
||
|
|
"num_tokens": 3080892.0,
|
||
|
|
"step": 1670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.392906522750854,
|
||
|
|
"epoch": 0.14072673808023525,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999637882769112,
|
||
|
|
"loss": 6.1429,
|
||
|
|
"mean_token_accuracy": 0.12803655937314035,
|
||
|
|
"num_tokens": 3089874.0,
|
||
|
|
"step": 1675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.369514799118042,
|
||
|
|
"epoch": 0.14114681789540012,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004999632490328447,
|
||
|
|
"loss": 6.2814,
|
||
|
|
"mean_token_accuracy": 0.12487674206495285,
|
||
|
|
"num_tokens": 3099535.0,
|
||
|
|
"step": 1680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.432224130630493,
|
||
|
|
"epoch": 0.14156689771056502,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004999627058036699,
|
||
|
|
"loss": 6.24,
|
||
|
|
"mean_token_accuracy": 0.12075779214501381,
|
||
|
|
"num_tokens": 3108772.0,
|
||
|
|
"step": 1685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.430401134490967,
|
||
|
|
"epoch": 0.1419869775257299,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999621585893966,
|
||
|
|
"loss": 6.2696,
|
||
|
|
"mean_token_accuracy": 0.11704754754900933,
|
||
|
|
"num_tokens": 3118333.0,
|
||
|
|
"step": 1690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.450057506561279,
|
||
|
|
"epoch": 0.14240705734089476,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999616073900346,
|
||
|
|
"loss": 6.3013,
|
||
|
|
"mean_token_accuracy": 0.12180939391255378,
|
||
|
|
"num_tokens": 3127356.0,
|
||
|
|
"step": 1695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.412153673171997,
|
||
|
|
"epoch": 0.14282713715605966,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004999610522055935,
|
||
|
|
"loss": 6.2662,
|
||
|
|
"mean_token_accuracy": 0.1200573742389679,
|
||
|
|
"num_tokens": 3136859.0,
|
||
|
|
"step": 1700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.451931762695312,
|
||
|
|
"epoch": 0.14324721697122453,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999604930360832,
|
||
|
|
"loss": 6.2945,
|
||
|
|
"mean_token_accuracy": 0.12161469012498856,
|
||
|
|
"num_tokens": 3146607.0,
|
||
|
|
"step": 1705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3816108226776125,
|
||
|
|
"epoch": 0.14366729678638943,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004999599298815136,
|
||
|
|
"loss": 6.2364,
|
||
|
|
"mean_token_accuracy": 0.12764545828104018,
|
||
|
|
"num_tokens": 3156327.0,
|
||
|
|
"step": 1710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.309280204772949,
|
||
|
|
"epoch": 0.1440873766015543,
|
||
|
|
"grad_norm": 1.5390625,
|
||
|
|
"learning_rate": 0.0004999593627418947,
|
||
|
|
"loss": 6.177,
|
||
|
|
"mean_token_accuracy": 0.13247063681483268,
|
||
|
|
"num_tokens": 3165559.0,
|
||
|
|
"step": 1715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.405248212814331,
|
||
|
|
"epoch": 0.14450745641671917,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999587916172365,
|
||
|
|
"loss": 6.2704,
|
||
|
|
"mean_token_accuracy": 0.1183898076415062,
|
||
|
|
"num_tokens": 3173850.0,
|
||
|
|
"step": 1720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.435620069503784,
|
||
|
|
"epoch": 0.14492753623188406,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999582165075492,
|
||
|
|
"loss": 6.22,
|
||
|
|
"mean_token_accuracy": 0.11956866905093193,
|
||
|
|
"num_tokens": 3182838.0,
|
||
|
|
"step": 1725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2884269714355465,
|
||
|
|
"epoch": 0.14534761604704893,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999576374128429,
|
||
|
|
"loss": 6.202,
|
||
|
|
"mean_token_accuracy": 0.1219302274286747,
|
||
|
|
"num_tokens": 3191692.0,
|
||
|
|
"step": 1730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.500776195526123,
|
||
|
|
"epoch": 0.14576769586221383,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999570543331279,
|
||
|
|
"loss": 6.226,
|
||
|
|
"mean_token_accuracy": 0.1263854332268238,
|
||
|
|
"num_tokens": 3200069.0,
|
||
|
|
"step": 1735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.411444854736328,
|
||
|
|
"epoch": 0.1461877756773787,
|
||
|
|
"grad_norm": 1.140625,
|
||
|
|
"learning_rate": 0.0004999564672684145,
|
||
|
|
"loss": 6.3228,
|
||
|
|
"mean_token_accuracy": 0.12090336456894875,
|
||
|
|
"num_tokens": 3209653.0,
|
||
|
|
"step": 1740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.448664712905884,
|
||
|
|
"epoch": 0.14660785549254357,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999558762187131,
|
||
|
|
"loss": 6.1938,
|
||
|
|
"mean_token_accuracy": 0.12701231315732003,
|
||
|
|
"num_tokens": 3218313.0,
|
||
|
|
"step": 1745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.32896614074707,
|
||
|
|
"epoch": 0.14702793530770847,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999552811840342,
|
||
|
|
"loss": 6.1297,
|
||
|
|
"mean_token_accuracy": 0.12769370079040526,
|
||
|
|
"num_tokens": 3227525.0,
|
||
|
|
"step": 1750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.335414171218872,
|
||
|
|
"epoch": 0.14744801512287334,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004999546821643884,
|
||
|
|
"loss": 6.2408,
|
||
|
|
"mean_token_accuracy": 0.12636618986725806,
|
||
|
|
"num_tokens": 3237022.0,
|
||
|
|
"step": 1755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.317769384384155,
|
||
|
|
"epoch": 0.14786809493803824,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999540791597861,
|
||
|
|
"loss": 6.1464,
|
||
|
|
"mean_token_accuracy": 0.12537204548716546,
|
||
|
|
"num_tokens": 3246605.0,
|
||
|
|
"step": 1760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.258312511444092,
|
||
|
|
"epoch": 0.1482881747532031,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999534721702383,
|
||
|
|
"loss": 6.0956,
|
||
|
|
"mean_token_accuracy": 0.13141294568777084,
|
||
|
|
"num_tokens": 3255587.0,
|
||
|
|
"step": 1765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.364277791976929,
|
||
|
|
"epoch": 0.148708254568368,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999528611957553,
|
||
|
|
"loss": 6.1968,
|
||
|
|
"mean_token_accuracy": 0.1267327442765236,
|
||
|
|
"num_tokens": 3265669.0,
|
||
|
|
"step": 1770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.433037424087525,
|
||
|
|
"epoch": 0.14912833438353287,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999522462363485,
|
||
|
|
"loss": 6.1795,
|
||
|
|
"mean_token_accuracy": 0.12822128161787988,
|
||
|
|
"num_tokens": 3275013.0,
|
||
|
|
"step": 1775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.372742748260498,
|
||
|
|
"epoch": 0.14954841419869774,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004999516272920283,
|
||
|
|
"loss": 6.2775,
|
||
|
|
"mean_token_accuracy": 0.12774404734373093,
|
||
|
|
"num_tokens": 3284723.0,
|
||
|
|
"step": 1780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.256136322021485,
|
||
|
|
"epoch": 0.14996849401386264,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000499951004362806,
|
||
|
|
"loss": 6.1087,
|
||
|
|
"mean_token_accuracy": 0.13196263536810876,
|
||
|
|
"num_tokens": 3293860.0,
|
||
|
|
"step": 1785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.278848552703858,
|
||
|
|
"epoch": 0.1503885738290275,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004999503774486924,
|
||
|
|
"loss": 6.1623,
|
||
|
|
"mean_token_accuracy": 0.13007338494062423,
|
||
|
|
"num_tokens": 3303158.0,
|
||
|
|
"step": 1790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.253765487670899,
|
||
|
|
"epoch": 0.1508086536441924,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004999497465496987,
|
||
|
|
"loss": 6.1083,
|
||
|
|
"mean_token_accuracy": 0.1231241799890995,
|
||
|
|
"num_tokens": 3313068.0,
|
||
|
|
"step": 1795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.319281959533692,
|
||
|
|
"epoch": 0.15122873345935728,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.000499949111665836,
|
||
|
|
"loss": 6.1761,
|
||
|
|
"mean_token_accuracy": 0.12510209009051323,
|
||
|
|
"num_tokens": 3321885.0,
|
||
|
|
"step": 1800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.368197298049926,
|
||
|
|
"epoch": 0.15164881327452215,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999484727971158,
|
||
|
|
"loss": 6.1707,
|
||
|
|
"mean_token_accuracy": 0.12798358947038652,
|
||
|
|
"num_tokens": 3330924.0,
|
||
|
|
"step": 1805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.339307403564453,
|
||
|
|
"epoch": 0.15206889308968705,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499947829943549,
|
||
|
|
"loss": 6.1964,
|
||
|
|
"mean_token_accuracy": 0.12618306949734687,
|
||
|
|
"num_tokens": 3340070.0,
|
||
|
|
"step": 1810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.394219160079956,
|
||
|
|
"epoch": 0.15248897290485192,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004999471831051474,
|
||
|
|
"loss": 6.1922,
|
||
|
|
"mean_token_accuracy": 0.13684661015868188,
|
||
|
|
"num_tokens": 3349870.0,
|
||
|
|
"step": 1815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.330759143829345,
|
||
|
|
"epoch": 0.1529090527200168,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004999465322819222,
|
||
|
|
"loss": 6.2371,
|
||
|
|
"mean_token_accuracy": 0.12111249193549156,
|
||
|
|
"num_tokens": 3359573.0,
|
||
|
|
"step": 1820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.372816276550293,
|
||
|
|
"epoch": 0.15332913253518168,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999458774738851,
|
||
|
|
"loss": 6.1732,
|
||
|
|
"mean_token_accuracy": 0.13470285311341285,
|
||
|
|
"num_tokens": 3368577.0,
|
||
|
|
"step": 1825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.352361059188842,
|
||
|
|
"epoch": 0.15374921235034655,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999452186810476,
|
||
|
|
"loss": 6.1469,
|
||
|
|
"mean_token_accuracy": 0.13113251850008964,
|
||
|
|
"num_tokens": 3377801.0,
|
||
|
|
"step": 1830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3680521011352536,
|
||
|
|
"epoch": 0.15416929216551145,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999445559034214,
|
||
|
|
"loss": 6.1995,
|
||
|
|
"mean_token_accuracy": 0.12895982414484025,
|
||
|
|
"num_tokens": 3386666.0,
|
||
|
|
"step": 1835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.443807363510132,
|
||
|
|
"epoch": 0.15458937198067632,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004999438891410181,
|
||
|
|
"loss": 6.3344,
|
||
|
|
"mean_token_accuracy": 0.12429568618535995,
|
||
|
|
"num_tokens": 3396086.0,
|
||
|
|
"step": 1840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.371559190750122,
|
||
|
|
"epoch": 0.15500945179584122,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999432183938496,
|
||
|
|
"loss": 6.2503,
|
||
|
|
"mean_token_accuracy": 0.1258139818906784,
|
||
|
|
"num_tokens": 3404894.0,
|
||
|
|
"step": 1845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.40411787033081,
|
||
|
|
"epoch": 0.1554295316110061,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999425436619279,
|
||
|
|
"loss": 6.2301,
|
||
|
|
"mean_token_accuracy": 0.1250107169151306,
|
||
|
|
"num_tokens": 3414172.0,
|
||
|
|
"step": 1850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4263053894042965,
|
||
|
|
"epoch": 0.15584961142617096,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.000499941864945265,
|
||
|
|
"loss": 6.2069,
|
||
|
|
"mean_token_accuracy": 0.12341500893235206,
|
||
|
|
"num_tokens": 3423409.0,
|
||
|
|
"step": 1855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2579625129699705,
|
||
|
|
"epoch": 0.15626969124133586,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004999411822438726,
|
||
|
|
"loss": 6.1554,
|
||
|
|
"mean_token_accuracy": 0.12717969343066216,
|
||
|
|
"num_tokens": 3433047.0,
|
||
|
|
"step": 1860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.4037513256073,
|
||
|
|
"epoch": 0.15668977105650073,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000499940495557763,
|
||
|
|
"loss": 6.1468,
|
||
|
|
"mean_token_accuracy": 0.12783457711338997,
|
||
|
|
"num_tokens": 3442490.0,
|
||
|
|
"step": 1865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.303406810760498,
|
||
|
|
"epoch": 0.15710985087166562,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999398048869485,
|
||
|
|
"loss": 6.2099,
|
||
|
|
"mean_token_accuracy": 0.129954195022583,
|
||
|
|
"num_tokens": 3451804.0,
|
||
|
|
"step": 1870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.385490417480469,
|
||
|
|
"epoch": 0.1575299306868305,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.000499939110231441,
|
||
|
|
"loss": 6.199,
|
||
|
|
"mean_token_accuracy": 0.1304432988166809,
|
||
|
|
"num_tokens": 3461481.0,
|
||
|
|
"step": 1875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.364220190048218,
|
||
|
|
"epoch": 0.1579500105019954,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999384115912531,
|
||
|
|
"loss": 6.2449,
|
||
|
|
"mean_token_accuracy": 0.13135363310575485,
|
||
|
|
"num_tokens": 3471798.0,
|
||
|
|
"step": 1880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.247316694259643,
|
||
|
|
"epoch": 0.15837009031716026,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.000499937708966397,
|
||
|
|
"loss": 6.1296,
|
||
|
|
"mean_token_accuracy": 0.12637364491820335,
|
||
|
|
"num_tokens": 3481386.0,
|
||
|
|
"step": 1885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.332306051254273,
|
||
|
|
"epoch": 0.15879017013232513,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004999370023568853,
|
||
|
|
"loss": 6.127,
|
||
|
|
"mean_token_accuracy": 0.1316571466624737,
|
||
|
|
"num_tokens": 3489981.0,
|
||
|
|
"step": 1890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.299954462051391,
|
||
|
|
"epoch": 0.15921024994749003,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999362917627304,
|
||
|
|
"loss": 6.1227,
|
||
|
|
"mean_token_accuracy": 0.1305247150361538,
|
||
|
|
"num_tokens": 3498551.0,
|
||
|
|
"step": 1895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.316105461120605,
|
||
|
|
"epoch": 0.1596303297626549,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999355771839448,
|
||
|
|
"loss": 6.0979,
|
||
|
|
"mean_token_accuracy": 0.12954429015517235,
|
||
|
|
"num_tokens": 3507921.0,
|
||
|
|
"step": 1900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.470440483093261,
|
||
|
|
"epoch": 0.1600504095778198,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004999348586205414,
|
||
|
|
"loss": 6.2729,
|
||
|
|
"mean_token_accuracy": 0.13220328316092492,
|
||
|
|
"num_tokens": 3517570.0,
|
||
|
|
"step": 1905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.38808388710022,
|
||
|
|
"epoch": 0.16047048939298467,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004999341360725327,
|
||
|
|
"loss": 6.2438,
|
||
|
|
"mean_token_accuracy": 0.123927091807127,
|
||
|
|
"num_tokens": 3526774.0,
|
||
|
|
"step": 1910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.285849714279175,
|
||
|
|
"epoch": 0.16089056920814954,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999334095399317,
|
||
|
|
"loss": 6.1859,
|
||
|
|
"mean_token_accuracy": 0.1361298866569996,
|
||
|
|
"num_tokens": 3535319.0,
|
||
|
|
"step": 1915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.249746656417846,
|
||
|
|
"epoch": 0.16131064902331443,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999326790227512,
|
||
|
|
"loss": 6.1605,
|
||
|
|
"mean_token_accuracy": 0.1271871216595173,
|
||
|
|
"num_tokens": 3544468.0,
|
||
|
|
"step": 1920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.217294788360595,
|
||
|
|
"epoch": 0.1617307288384793,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004999319445210041,
|
||
|
|
"loss": 6.0261,
|
||
|
|
"mean_token_accuracy": 0.1361843690276146,
|
||
|
|
"num_tokens": 3553529.0,
|
||
|
|
"step": 1925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.290815734863282,
|
||
|
|
"epoch": 0.1621508086536442,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999312060347034,
|
||
|
|
"loss": 6.1011,
|
||
|
|
"mean_token_accuracy": 0.13233864828944206,
|
||
|
|
"num_tokens": 3563053.0,
|
||
|
|
"step": 1930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.224975728988648,
|
||
|
|
"epoch": 0.16257088846880907,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004999304635638621,
|
||
|
|
"loss": 6.0288,
|
||
|
|
"mean_token_accuracy": 0.1342104844748974,
|
||
|
|
"num_tokens": 3571877.0,
|
||
|
|
"step": 1935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.233099460601807,
|
||
|
|
"epoch": 0.16299096828397394,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004999297171084935,
|
||
|
|
"loss": 6.091,
|
||
|
|
"mean_token_accuracy": 0.13373700231313707,
|
||
|
|
"num_tokens": 3581496.0,
|
||
|
|
"step": 1940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.324843549728394,
|
||
|
|
"epoch": 0.16341104809913884,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999289666686109,
|
||
|
|
"loss": 6.1071,
|
||
|
|
"mean_token_accuracy": 0.1308230109512806,
|
||
|
|
"num_tokens": 3590752.0,
|
||
|
|
"step": 1945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.129473495483398,
|
||
|
|
"epoch": 0.1638311279143037,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004999282122442274,
|
||
|
|
"loss": 6.1072,
|
||
|
|
"mean_token_accuracy": 0.1328013814985752,
|
||
|
|
"num_tokens": 3599885.0,
|
||
|
|
"step": 1950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.387533235549927,
|
||
|
|
"epoch": 0.1642512077294686,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004999274538353564,
|
||
|
|
"loss": 6.1968,
|
||
|
|
"mean_token_accuracy": 0.12293331325054169,
|
||
|
|
"num_tokens": 3610039.0,
|
||
|
|
"step": 1955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2677867889404295,
|
||
|
|
"epoch": 0.16467128754463348,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999266914420114,
|
||
|
|
"loss": 6.1123,
|
||
|
|
"mean_token_accuracy": 0.12491545528173446,
|
||
|
|
"num_tokens": 3619954.0,
|
||
|
|
"step": 1960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.291842746734619,
|
||
|
|
"epoch": 0.16509136735979837,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000499925925064206,
|
||
|
|
"loss": 6.0646,
|
||
|
|
"mean_token_accuracy": 0.13617814630270003,
|
||
|
|
"num_tokens": 3628164.0,
|
||
|
|
"step": 1965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.377547359466552,
|
||
|
|
"epoch": 0.16551144717496324,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999251547019535,
|
||
|
|
"loss": 6.2126,
|
||
|
|
"mean_token_accuracy": 0.13370679765939714,
|
||
|
|
"num_tokens": 3636778.0,
|
||
|
|
"step": 1970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.318364191055298,
|
||
|
|
"epoch": 0.16593152699012811,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004999243803552678,
|
||
|
|
"loss": 6.1666,
|
||
|
|
"mean_token_accuracy": 0.13474627435207367,
|
||
|
|
"num_tokens": 3647046.0,
|
||
|
|
"step": 1975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2661604404449465,
|
||
|
|
"epoch": 0.166351606805293,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004999236020241625,
|
||
|
|
"loss": 6.0969,
|
||
|
|
"mean_token_accuracy": 0.1302388660609722,
|
||
|
|
"num_tokens": 3656130.0,
|
||
|
|
"step": 1980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.294794940948487,
|
||
|
|
"epoch": 0.16677168662045788,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004999228197086514,
|
||
|
|
"loss": 6.1791,
|
||
|
|
"mean_token_accuracy": 0.12147556319832802,
|
||
|
|
"num_tokens": 3666145.0,
|
||
|
|
"step": 1985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.308886766433716,
|
||
|
|
"epoch": 0.16719176643562278,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004999220334087484,
|
||
|
|
"loss": 6.2221,
|
||
|
|
"mean_token_accuracy": 0.12820759564638137,
|
||
|
|
"num_tokens": 3676722.0,
|
||
|
|
"step": 1990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.34148588180542,
|
||
|
|
"epoch": 0.16761184625078765,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004999212431244673,
|
||
|
|
"loss": 6.1977,
|
||
|
|
"mean_token_accuracy": 0.1265730917453766,
|
||
|
|
"num_tokens": 3685880.0,
|
||
|
|
"step": 1995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.220745372772217,
|
||
|
|
"epoch": 0.16803192606595252,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999204488558222,
|
||
|
|
"loss": 6.0332,
|
||
|
|
"mean_token_accuracy": 0.13368572890758515,
|
||
|
|
"num_tokens": 3695167.0,
|
||
|
|
"step": 2000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.279938268661499,
|
||
|
|
"epoch": 0.16845200588111742,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004999196506028273,
|
||
|
|
"loss": 6.1455,
|
||
|
|
"mean_token_accuracy": 0.12803823873400688,
|
||
|
|
"num_tokens": 3703700.0,
|
||
|
|
"step": 2005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.340878582000732,
|
||
|
|
"epoch": 0.1688720856962823,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004999188483654965,
|
||
|
|
"loss": 6.0938,
|
||
|
|
"mean_token_accuracy": 0.12776080071926116,
|
||
|
|
"num_tokens": 3712825.0,
|
||
|
|
"step": 2010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.229676914215088,
|
||
|
|
"epoch": 0.16929216551144718,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004999180421438442,
|
||
|
|
"loss": 6.0447,
|
||
|
|
"mean_token_accuracy": 0.13442618474364282,
|
||
|
|
"num_tokens": 3721807.0,
|
||
|
|
"step": 2015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3377564430236815,
|
||
|
|
"epoch": 0.16971224532661205,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999172319378846,
|
||
|
|
"loss": 6.2308,
|
||
|
|
"mean_token_accuracy": 0.12342165559530258,
|
||
|
|
"num_tokens": 3730502.0,
|
||
|
|
"step": 2020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.334515047073364,
|
||
|
|
"epoch": 0.17013232514177692,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004999164177476319,
|
||
|
|
"loss": 6.1138,
|
||
|
|
"mean_token_accuracy": 0.13388336971402168,
|
||
|
|
"num_tokens": 3739696.0,
|
||
|
|
"step": 2025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.170955038070678,
|
||
|
|
"epoch": 0.17055240495694182,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004999155995731009,
|
||
|
|
"loss": 6.1168,
|
||
|
|
"mean_token_accuracy": 0.1329979881644249,
|
||
|
|
"num_tokens": 3748675.0,
|
||
|
|
"step": 2030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.440923643112183,
|
||
|
|
"epoch": 0.1709724847721067,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999147774143057,
|
||
|
|
"loss": 6.1895,
|
||
|
|
"mean_token_accuracy": 0.12849014177918433,
|
||
|
|
"num_tokens": 3757714.0,
|
||
|
|
"step": 2035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.217456531524658,
|
||
|
|
"epoch": 0.1713925645872716,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000499913951271261,
|
||
|
|
"loss": 6.0181,
|
||
|
|
"mean_token_accuracy": 0.13668849244713782,
|
||
|
|
"num_tokens": 3767589.0,
|
||
|
|
"step": 2040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.216994047164917,
|
||
|
|
"epoch": 0.17181264440243646,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004999131211439816,
|
||
|
|
"loss": 6.1246,
|
||
|
|
"mean_token_accuracy": 0.13397686704993247,
|
||
|
|
"num_tokens": 3777261.0,
|
||
|
|
"step": 2045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.3198566913604735,
|
||
|
|
"epoch": 0.17223272421760136,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000499912287032482,
|
||
|
|
"loss": 6.0738,
|
||
|
|
"mean_token_accuracy": 0.13602124899625778,
|
||
|
|
"num_tokens": 3786658.0,
|
||
|
|
"step": 2050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.19984622001648,
|
||
|
|
"epoch": 0.17265280403276623,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000499911448936777,
|
||
|
|
"loss": 6.0669,
|
||
|
|
"mean_token_accuracy": 0.14067015573382377,
|
||
|
|
"num_tokens": 3794977.0,
|
||
|
|
"step": 2055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.179085731506348,
|
||
|
|
"epoch": 0.1730728838479311,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004999106068568816,
|
||
|
|
"loss": 6.1457,
|
||
|
|
"mean_token_accuracy": 0.12947675883769988,
|
||
|
|
"num_tokens": 3805138.0,
|
||
|
|
"step": 2060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.279845762252807,
|
||
|
|
"epoch": 0.173492963663096,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999097607928106,
|
||
|
|
"loss": 6.0911,
|
||
|
|
"mean_token_accuracy": 0.13879665359854698,
|
||
|
|
"num_tokens": 3814444.0,
|
||
|
|
"step": 2065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.212150764465332,
|
||
|
|
"epoch": 0.17391304347826086,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004999089107445788,
|
||
|
|
"loss": 6.0398,
|
||
|
|
"mean_token_accuracy": 0.13306153938174248,
|
||
|
|
"num_tokens": 3822859.0,
|
||
|
|
"step": 2070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.133330869674682,
|
||
|
|
"epoch": 0.17433312329342576,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004999080567122016,
|
||
|
|
"loss": 6.0707,
|
||
|
|
"mean_token_accuracy": 0.13198764845728875,
|
||
|
|
"num_tokens": 3833159.0,
|
||
|
|
"step": 2075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.295455646514893,
|
||
|
|
"epoch": 0.17475320310859063,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004999071986956941,
|
||
|
|
"loss": 6.0856,
|
||
|
|
"mean_token_accuracy": 0.13797224685549736,
|
||
|
|
"num_tokens": 3842136.0,
|
||
|
|
"step": 2080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.208657741546631,
|
||
|
|
"epoch": 0.1751732829237555,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999063366950713,
|
||
|
|
"loss": 6.1499,
|
||
|
|
"mean_token_accuracy": 0.12877421900629998,
|
||
|
|
"num_tokens": 3851406.0,
|
||
|
|
"step": 2085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.217505025863647,
|
||
|
|
"epoch": 0.1755933627389204,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004999054707103486,
|
||
|
|
"loss": 6.0713,
|
||
|
|
"mean_token_accuracy": 0.1279774695634842,
|
||
|
|
"num_tokens": 3861061.0,
|
||
|
|
"step": 2090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.265169095993042,
|
||
|
|
"epoch": 0.17601344255408527,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999046007415412,
|
||
|
|
"loss": 6.0378,
|
||
|
|
"mean_token_accuracy": 0.12900712937116623,
|
||
|
|
"num_tokens": 3870357.0,
|
||
|
|
"step": 2095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2917054176330565,
|
||
|
|
"epoch": 0.17643352236925017,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004999037267886646,
|
||
|
|
"loss": 6.0715,
|
||
|
|
"mean_token_accuracy": 0.13141706436872483,
|
||
|
|
"num_tokens": 3879393.0,
|
||
|
|
"step": 2100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.180794954299927,
|
||
|
|
"epoch": 0.17685360218441504,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004999028488517343,
|
||
|
|
"loss": 6.0832,
|
||
|
|
"mean_token_accuracy": 0.13525146320462228,
|
||
|
|
"num_tokens": 3888030.0,
|
||
|
|
"step": 2105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.266747093200683,
|
||
|
|
"epoch": 0.1772736819995799,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004999019669307659,
|
||
|
|
"loss": 6.0788,
|
||
|
|
"mean_token_accuracy": 0.1376435212790966,
|
||
|
|
"num_tokens": 3897430.0,
|
||
|
|
"step": 2110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.238908100128174,
|
||
|
|
"epoch": 0.1776937618147448,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004999010810257749,
|
||
|
|
"loss": 6.0977,
|
||
|
|
"mean_token_accuracy": 0.12719068825244903,
|
||
|
|
"num_tokens": 3907711.0,
|
||
|
|
"step": 2115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.189173746109009,
|
||
|
|
"epoch": 0.17811384162990967,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004999001911367771,
|
||
|
|
"loss": 6.0411,
|
||
|
|
"mean_token_accuracy": 0.13638337776064874,
|
||
|
|
"num_tokens": 3915816.0,
|
||
|
|
"step": 2120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.22648811340332,
|
||
|
|
"epoch": 0.17853392144507457,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004998992972637883,
|
||
|
|
"loss": 6.1538,
|
||
|
|
"mean_token_accuracy": 0.12582943066954613,
|
||
|
|
"num_tokens": 3925162.0,
|
||
|
|
"step": 2125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.284874153137207,
|
||
|
|
"epoch": 0.17895400126023944,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004998983994068242,
|
||
|
|
"loss": 6.0395,
|
||
|
|
"mean_token_accuracy": 0.13122835606336594,
|
||
|
|
"num_tokens": 3934476.0,
|
||
|
|
"step": 2130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.186276054382324,
|
||
|
|
"epoch": 0.17937408107540434,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004998974975659006,
|
||
|
|
"loss": 6.0907,
|
||
|
|
"mean_token_accuracy": 0.1297646477818489,
|
||
|
|
"num_tokens": 3943501.0,
|
||
|
|
"step": 2135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.205726194381714,
|
||
|
|
"epoch": 0.1797941608905692,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004998965917410338,
|
||
|
|
"loss": 6.0816,
|
||
|
|
"mean_token_accuracy": 0.12778471410274506,
|
||
|
|
"num_tokens": 3953663.0,
|
||
|
|
"step": 2140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.211074018478394,
|
||
|
|
"epoch": 0.18021424070573408,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004998956819322397,
|
||
|
|
"loss": 6.0495,
|
||
|
|
"mean_token_accuracy": 0.13608243688941002,
|
||
|
|
"num_tokens": 3962634.0,
|
||
|
|
"step": 2145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.177238512039184,
|
||
|
|
"epoch": 0.18063432052089898,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004998947681395343,
|
||
|
|
"loss": 6.052,
|
||
|
|
"mean_token_accuracy": 0.13605224341154099,
|
||
|
|
"num_tokens": 3972496.0,
|
||
|
|
"step": 2150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.390697908401489,
|
||
|
|
"epoch": 0.18105440033606385,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499893850362934,
|
||
|
|
"loss": 6.2977,
|
||
|
|
"mean_token_accuracy": 0.12441082820296287,
|
||
|
|
"num_tokens": 3980724.0,
|
||
|
|
"step": 2155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.262918901443482,
|
||
|
|
"epoch": 0.18147448015122875,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998929286024548,
|
||
|
|
"loss": 6.1304,
|
||
|
|
"mean_token_accuracy": 0.1300631955265999,
|
||
|
|
"num_tokens": 3989842.0,
|
||
|
|
"step": 2160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.230935716629029,
|
||
|
|
"epoch": 0.18189455996639362,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004998920028581133,
|
||
|
|
"loss": 6.0378,
|
||
|
|
"mean_token_accuracy": 0.14167480319738388,
|
||
|
|
"num_tokens": 3998534.0,
|
||
|
|
"step": 2165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.241239356994629,
|
||
|
|
"epoch": 0.18231463978155849,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004998910731299258,
|
||
|
|
"loss": 6.0631,
|
||
|
|
"mean_token_accuracy": 0.13066420927643776,
|
||
|
|
"num_tokens": 4007677.0,
|
||
|
|
"step": 2170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.19789605140686,
|
||
|
|
"epoch": 0.18273471959672338,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998901394179085,
|
||
|
|
"loss": 6.1007,
|
||
|
|
"mean_token_accuracy": 0.12627347633242608,
|
||
|
|
"num_tokens": 4016347.0,
|
||
|
|
"step": 2175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.198655843734741,
|
||
|
|
"epoch": 0.18315479941188825,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004998892017220784,
|
||
|
|
"loss": 5.9767,
|
||
|
|
"mean_token_accuracy": 0.14088783264160157,
|
||
|
|
"num_tokens": 4025199.0,
|
||
|
|
"step": 2180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.262273931503296,
|
||
|
|
"epoch": 0.18357487922705315,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004998882600424519,
|
||
|
|
"loss": 6.0603,
|
||
|
|
"mean_token_accuracy": 0.1286892294883728,
|
||
|
|
"num_tokens": 4033933.0,
|
||
|
|
"step": 2185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.162368822097778,
|
||
|
|
"epoch": 0.18399495904221802,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004998873143790455,
|
||
|
|
"loss": 5.9753,
|
||
|
|
"mean_token_accuracy": 0.1438771367073059,
|
||
|
|
"num_tokens": 4042891.0,
|
||
|
|
"step": 2190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.274066638946533,
|
||
|
|
"epoch": 0.1844150388573829,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004998863647318763,
|
||
|
|
"loss": 6.1041,
|
||
|
|
"mean_token_accuracy": 0.13264708146452903,
|
||
|
|
"num_tokens": 4051123.0,
|
||
|
|
"step": 2195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.144877004623413,
|
||
|
|
"epoch": 0.1848351186725478,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004998854111009608,
|
||
|
|
"loss": 6.0715,
|
||
|
|
"mean_token_accuracy": 0.12865814492106437,
|
||
|
|
"num_tokens": 4060025.0,
|
||
|
|
"step": 2200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.182585954666138,
|
||
|
|
"epoch": 0.18525519848771266,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004998844534863161,
|
||
|
|
"loss": 5.991,
|
||
|
|
"mean_token_accuracy": 0.1295328378677368,
|
||
|
|
"num_tokens": 4069363.0,
|
||
|
|
"step": 2205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.241155099868775,
|
||
|
|
"epoch": 0.18567527830287756,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004998834918879592,
|
||
|
|
"loss": 6.1376,
|
||
|
|
"mean_token_accuracy": 0.133307021856308,
|
||
|
|
"num_tokens": 4078855.0,
|
||
|
|
"step": 2210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.206245565414429,
|
||
|
|
"epoch": 0.18609535811804243,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000499882526305907,
|
||
|
|
"loss": 6.0804,
|
||
|
|
"mean_token_accuracy": 0.12953457087278367,
|
||
|
|
"num_tokens": 4087801.0,
|
||
|
|
"step": 2215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.248236179351807,
|
||
|
|
"epoch": 0.18651543793320732,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004998815567401765,
|
||
|
|
"loss": 6.0926,
|
||
|
|
"mean_token_accuracy": 0.1376325160264969,
|
||
|
|
"num_tokens": 4096949.0,
|
||
|
|
"step": 2220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.279425954818725,
|
||
|
|
"epoch": 0.1869355177483722,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998805831907851,
|
||
|
|
"loss": 6.0617,
|
||
|
|
"mean_token_accuracy": 0.13082574903964997,
|
||
|
|
"num_tokens": 4105399.0,
|
||
|
|
"step": 2225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.169968605041504,
|
||
|
|
"epoch": 0.18735559756353706,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004998796056577501,
|
||
|
|
"loss": 6.0071,
|
||
|
|
"mean_token_accuracy": 0.12926321402192115,
|
||
|
|
"num_tokens": 4113873.0,
|
||
|
|
"step": 2230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.154512643814087,
|
||
|
|
"epoch": 0.18777567737870196,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004998786241410886,
|
||
|
|
"loss": 6.0586,
|
||
|
|
"mean_token_accuracy": 0.13699585050344468,
|
||
|
|
"num_tokens": 4123528.0,
|
||
|
|
"step": 2235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2988721370697025,
|
||
|
|
"epoch": 0.18819575719386683,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.000499877638640818,
|
||
|
|
"loss": 6.0699,
|
||
|
|
"mean_token_accuracy": 0.13017342165112494,
|
||
|
|
"num_tokens": 4133370.0,
|
||
|
|
"step": 2240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.184452104568481,
|
||
|
|
"epoch": 0.18861583700903173,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.000499876649156956,
|
||
|
|
"loss": 5.9844,
|
||
|
|
"mean_token_accuracy": 0.13666255846619607,
|
||
|
|
"num_tokens": 4142370.0,
|
||
|
|
"step": 2245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.133312082290649,
|
||
|
|
"epoch": 0.1890359168241966,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998756556895196,
|
||
|
|
"loss": 6.0725,
|
||
|
|
"mean_token_accuracy": 0.1354515865445137,
|
||
|
|
"num_tokens": 4152367.0,
|
||
|
|
"step": 2250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.21663122177124,
|
||
|
|
"epoch": 0.18945599663936147,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000499874658238527,
|
||
|
|
"loss": 6.0625,
|
||
|
|
"mean_token_accuracy": 0.13495326191186904,
|
||
|
|
"num_tokens": 4161126.0,
|
||
|
|
"step": 2255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.186970901489258,
|
||
|
|
"epoch": 0.18987607645452637,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004998736568039957,
|
||
|
|
"loss": 5.9748,
|
||
|
|
"mean_token_accuracy": 0.13723411411046982,
|
||
|
|
"num_tokens": 4169910.0,
|
||
|
|
"step": 2260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1857301712036135,
|
||
|
|
"epoch": 0.19029615626969124,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004998726513859432,
|
||
|
|
"loss": 6.1067,
|
||
|
|
"mean_token_accuracy": 0.12761787325143814,
|
||
|
|
"num_tokens": 4179893.0,
|
||
|
|
"step": 2265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.308238935470581,
|
||
|
|
"epoch": 0.19071623608485613,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004998716419843875,
|
||
|
|
"loss": 6.12,
|
||
|
|
"mean_token_accuracy": 0.13745217099785806,
|
||
|
|
"num_tokens": 4190065.0,
|
||
|
|
"step": 2270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.090948486328125,
|
||
|
|
"epoch": 0.191136315900021,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004998706285993465,
|
||
|
|
"loss": 6.0313,
|
||
|
|
"mean_token_accuracy": 0.1420229621231556,
|
||
|
|
"num_tokens": 4198395.0,
|
||
|
|
"step": 2275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.282499647140503,
|
||
|
|
"epoch": 0.19155639571518587,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004998696112308381,
|
||
|
|
"loss": 6.0533,
|
||
|
|
"mean_token_accuracy": 0.1310360386967659,
|
||
|
|
"num_tokens": 4207555.0,
|
||
|
|
"step": 2280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.088230180740356,
|
||
|
|
"epoch": 0.19197647553035077,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004998685898788803,
|
||
|
|
"loss": 5.9946,
|
||
|
|
"mean_token_accuracy": 0.13536595478653907,
|
||
|
|
"num_tokens": 4216533.0,
|
||
|
|
"step": 2285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.274929618835449,
|
||
|
|
"epoch": 0.19239655534551564,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998675645434914,
|
||
|
|
"loss": 6.1095,
|
||
|
|
"mean_token_accuracy": 0.13767784610390663,
|
||
|
|
"num_tokens": 4225575.0,
|
||
|
|
"step": 2290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.153714513778686,
|
||
|
|
"epoch": 0.19281663516068054,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998665352246891,
|
||
|
|
"loss": 5.8958,
|
||
|
|
"mean_token_accuracy": 0.14245088025927544,
|
||
|
|
"num_tokens": 4234306.0,
|
||
|
|
"step": 2295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.08680305480957,
|
||
|
|
"epoch": 0.1932367149758454,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004998655019224921,
|
||
|
|
"loss": 6.0823,
|
||
|
|
"mean_token_accuracy": 0.1359329827129841,
|
||
|
|
"num_tokens": 4243998.0,
|
||
|
|
"step": 2300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.237053394317627,
|
||
|
|
"epoch": 0.19365679479101028,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004998644646369185,
|
||
|
|
"loss": 5.9776,
|
||
|
|
"mean_token_accuracy": 0.13352483361959458,
|
||
|
|
"num_tokens": 4253653.0,
|
||
|
|
"step": 2305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.139167737960816,
|
||
|
|
"epoch": 0.19407687460617518,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004998634233679865,
|
||
|
|
"loss": 6.0652,
|
||
|
|
"mean_token_accuracy": 0.1278400629758835,
|
||
|
|
"num_tokens": 4263305.0,
|
||
|
|
"step": 2310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.127392339706421,
|
||
|
|
"epoch": 0.19449695442134005,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000499862378115715,
|
||
|
|
"loss": 5.9342,
|
||
|
|
"mean_token_accuracy": 0.14543856382369996,
|
||
|
|
"num_tokens": 4272212.0,
|
||
|
|
"step": 2315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.305202007293701,
|
||
|
|
"epoch": 0.19491703423650494,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004998613288801221,
|
||
|
|
"loss": 6.1375,
|
||
|
|
"mean_token_accuracy": 0.13151465207338334,
|
||
|
|
"num_tokens": 4281445.0,
|
||
|
|
"step": 2320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2177956104278564,
|
||
|
|
"epoch": 0.1953371140516698,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004998602756612267,
|
||
|
|
"loss": 6.055,
|
||
|
|
"mean_token_accuracy": 0.1372949168086052,
|
||
|
|
"num_tokens": 4290938.0,
|
||
|
|
"step": 2325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.175972557067871,
|
||
|
|
"epoch": 0.1957571938668347,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004998592184590471,
|
||
|
|
"loss": 6.0786,
|
||
|
|
"mean_token_accuracy": 0.13233636021614076,
|
||
|
|
"num_tokens": 4300022.0,
|
||
|
|
"step": 2330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134920358657837,
|
||
|
|
"epoch": 0.19617727368199958,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004998581572736024,
|
||
|
|
"loss": 5.9674,
|
||
|
|
"mean_token_accuracy": 0.1363460712134838,
|
||
|
|
"num_tokens": 4308910.0,
|
||
|
|
"step": 2335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.092206907272339,
|
||
|
|
"epoch": 0.19659735349716445,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004998570921049112,
|
||
|
|
"loss": 5.9454,
|
||
|
|
"mean_token_accuracy": 0.13969452679157257,
|
||
|
|
"num_tokens": 4317136.0,
|
||
|
|
"step": 2340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.112558746337891,
|
||
|
|
"epoch": 0.19701743331232935,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004998560229529924,
|
||
|
|
"loss": 5.9993,
|
||
|
|
"mean_token_accuracy": 0.1428337089717388,
|
||
|
|
"num_tokens": 4326163.0,
|
||
|
|
"step": 2345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.308993816375732,
|
||
|
|
"epoch": 0.19743751312749422,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004998549498178649,
|
||
|
|
"loss": 6.1402,
|
||
|
|
"mean_token_accuracy": 0.13658420667052268,
|
||
|
|
"num_tokens": 4335837.0,
|
||
|
|
"step": 2350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.216946363449097,
|
||
|
|
"epoch": 0.19785759294265912,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004998538726995477,
|
||
|
|
"loss": 6.0561,
|
||
|
|
"mean_token_accuracy": 0.1374947391450405,
|
||
|
|
"num_tokens": 4345108.0,
|
||
|
|
"step": 2355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.217574787139893,
|
||
|
|
"epoch": 0.198277672757824,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.00049985279159806,
|
||
|
|
"loss": 6.0722,
|
||
|
|
"mean_token_accuracy": 0.1334306165575981,
|
||
|
|
"num_tokens": 4353761.0,
|
||
|
|
"step": 2360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1630774974823,
|
||
|
|
"epoch": 0.19869775257298886,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004998517065134208,
|
||
|
|
"loss": 6.0354,
|
||
|
|
"mean_token_accuracy": 0.13587109968066216,
|
||
|
|
"num_tokens": 4363244.0,
|
||
|
|
"step": 2365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.205533790588379,
|
||
|
|
"epoch": 0.19911783238815375,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004998506174456494,
|
||
|
|
"loss": 6.0386,
|
||
|
|
"mean_token_accuracy": 0.13257589265704156,
|
||
|
|
"num_tokens": 4373034.0,
|
||
|
|
"step": 2370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.200410652160644,
|
||
|
|
"epoch": 0.19953791220331862,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004998495243947653,
|
||
|
|
"loss": 5.9816,
|
||
|
|
"mean_token_accuracy": 0.13029902279376984,
|
||
|
|
"num_tokens": 4382554.0,
|
||
|
|
"step": 2375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.191087865829468,
|
||
|
|
"epoch": 0.19995799201848352,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998484273607875,
|
||
|
|
"loss": 5.9843,
|
||
|
|
"mean_token_accuracy": 0.14299238696694375,
|
||
|
|
"num_tokens": 4391001.0,
|
||
|
|
"step": 2380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.023518228530884,
|
||
|
|
"epoch": 0.2003780718336484,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004998473263437356,
|
||
|
|
"loss": 5.9141,
|
||
|
|
"mean_token_accuracy": 0.13673870489001275,
|
||
|
|
"num_tokens": 4400632.0,
|
||
|
|
"step": 2385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.105119514465332,
|
||
|
|
"epoch": 0.20079815164881326,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.000499846221343629,
|
||
|
|
"loss": 6.0095,
|
||
|
|
"mean_token_accuracy": 0.12952324375510216,
|
||
|
|
"num_tokens": 4409565.0,
|
||
|
|
"step": 2390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.128167533874512,
|
||
|
|
"epoch": 0.20121823146397816,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998451123604875,
|
||
|
|
"loss": 5.944,
|
||
|
|
"mean_token_accuracy": 0.14282809123396872,
|
||
|
|
"num_tokens": 4418384.0,
|
||
|
|
"step": 2395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1983355522155765,
|
||
|
|
"epoch": 0.20163831127914303,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004998439993943306,
|
||
|
|
"loss": 6.0692,
|
||
|
|
"mean_token_accuracy": 0.1389256276190281,
|
||
|
|
"num_tokens": 4427581.0,
|
||
|
|
"step": 2400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.267655086517334,
|
||
|
|
"epoch": 0.20205839109430793,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004998428824451779,
|
||
|
|
"loss": 6.0521,
|
||
|
|
"mean_token_accuracy": 0.1341543450951576,
|
||
|
|
"num_tokens": 4436572.0,
|
||
|
|
"step": 2405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1763083934783936,
|
||
|
|
"epoch": 0.2024784709094728,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004998417615130495,
|
||
|
|
"loss": 6.055,
|
||
|
|
"mean_token_accuracy": 0.13537125810980796,
|
||
|
|
"num_tokens": 4445230.0,
|
||
|
|
"step": 2410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.247248315811158,
|
||
|
|
"epoch": 0.2028985507246377,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004998406365979649,
|
||
|
|
"loss": 6.1134,
|
||
|
|
"mean_token_accuracy": 0.13383878991007805,
|
||
|
|
"num_tokens": 4454251.0,
|
||
|
|
"step": 2415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.136447811126709,
|
||
|
|
"epoch": 0.20331863053980256,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004998395076999443,
|
||
|
|
"loss": 5.9699,
|
||
|
|
"mean_token_accuracy": 0.13695907220244408,
|
||
|
|
"num_tokens": 4463949.0,
|
||
|
|
"step": 2420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.227413558959961,
|
||
|
|
"epoch": 0.20373871035496743,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998383748190076,
|
||
|
|
"loss": 6.1649,
|
||
|
|
"mean_token_accuracy": 0.12917085587978364,
|
||
|
|
"num_tokens": 4473373.0,
|
||
|
|
"step": 2425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.249214363098145,
|
||
|
|
"epoch": 0.20415879017013233,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998372379551748,
|
||
|
|
"loss": 5.9842,
|
||
|
|
"mean_token_accuracy": 0.1414948470890522,
|
||
|
|
"num_tokens": 4482303.0,
|
||
|
|
"step": 2430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.117572832107544,
|
||
|
|
"epoch": 0.2045788699852972,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004998360971084663,
|
||
|
|
"loss": 5.9567,
|
||
|
|
"mean_token_accuracy": 0.1317524030804634,
|
||
|
|
"num_tokens": 4491214.0,
|
||
|
|
"step": 2435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.057681226730347,
|
||
|
|
"epoch": 0.2049989498004621,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004998349522789019,
|
||
|
|
"loss": 5.8856,
|
||
|
|
"mean_token_accuracy": 0.14377139806747435,
|
||
|
|
"num_tokens": 4500099.0,
|
||
|
|
"step": 2440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.115459060668945,
|
||
|
|
"epoch": 0.20541902961562697,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998338034665021,
|
||
|
|
"loss": 5.9692,
|
||
|
|
"mean_token_accuracy": 0.1437109664082527,
|
||
|
|
"num_tokens": 4509893.0,
|
||
|
|
"step": 2445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.08744249343872,
|
||
|
|
"epoch": 0.20583910943079184,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004998326506712872,
|
||
|
|
"loss": 5.9375,
|
||
|
|
"mean_token_accuracy": 0.13774847760796546,
|
||
|
|
"num_tokens": 4518606.0,
|
||
|
|
"step": 2450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.11673412322998,
|
||
|
|
"epoch": 0.20625918924595674,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004998314938932778,
|
||
|
|
"loss": 6.0218,
|
||
|
|
"mean_token_accuracy": 0.14001012295484544,
|
||
|
|
"num_tokens": 4528392.0,
|
||
|
|
"step": 2455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.221143388748169,
|
||
|
|
"epoch": 0.2066792690611216,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998303331324943,
|
||
|
|
"loss": 5.9923,
|
||
|
|
"mean_token_accuracy": 0.13821439668536187,
|
||
|
|
"num_tokens": 4536983.0,
|
||
|
|
"step": 2460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.041988134384155,
|
||
|
|
"epoch": 0.2070993488762865,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998291683889571,
|
||
|
|
"loss": 5.9145,
|
||
|
|
"mean_token_accuracy": 0.1391140677034855,
|
||
|
|
"num_tokens": 4544967.0,
|
||
|
|
"step": 2465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.134957313537598,
|
||
|
|
"epoch": 0.20751942869145137,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000499827999662687,
|
||
|
|
"loss": 5.9727,
|
||
|
|
"mean_token_accuracy": 0.13200750946998596,
|
||
|
|
"num_tokens": 4554646.0,
|
||
|
|
"step": 2470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.192252588272095,
|
||
|
|
"epoch": 0.20793950850661624,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004998268269537046,
|
||
|
|
"loss": 5.9954,
|
||
|
|
"mean_token_accuracy": 0.1370847873389721,
|
||
|
|
"num_tokens": 4564040.0,
|
||
|
|
"step": 2475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.091167068481445,
|
||
|
|
"epoch": 0.20835958832178114,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998256502620308,
|
||
|
|
"loss": 6.0187,
|
||
|
|
"mean_token_accuracy": 0.14094985872507096,
|
||
|
|
"num_tokens": 4573758.0,
|
||
|
|
"step": 2480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.206011056900024,
|
||
|
|
"epoch": 0.208779668136946,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004998244695876864,
|
||
|
|
"loss": 6.0452,
|
||
|
|
"mean_token_accuracy": 0.13380730673670768,
|
||
|
|
"num_tokens": 4582097.0,
|
||
|
|
"step": 2485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0949585914611815,
|
||
|
|
"epoch": 0.2091997479521109,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004998232849306921,
|
||
|
|
"loss": 6.0055,
|
||
|
|
"mean_token_accuracy": 0.13993047401309014,
|
||
|
|
"num_tokens": 4590687.0,
|
||
|
|
"step": 2490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1933338165283205,
|
||
|
|
"epoch": 0.20961982776727578,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004998220962910693,
|
||
|
|
"loss": 5.9965,
|
||
|
|
"mean_token_accuracy": 0.13453714549541473,
|
||
|
|
"num_tokens": 4599497.0,
|
||
|
|
"step": 2495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.101396179199218,
|
||
|
|
"epoch": 0.21003990758244068,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004998209036688386,
|
||
|
|
"loss": 5.9532,
|
||
|
|
"mean_token_accuracy": 0.13716981932520866,
|
||
|
|
"num_tokens": 4607958.0,
|
||
|
|
"step": 2500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.216299772262573,
|
||
|
|
"epoch": 0.21045998739760555,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004998197070640216,
|
||
|
|
"loss": 6.0812,
|
||
|
|
"mean_token_accuracy": 0.1314453199505806,
|
||
|
|
"num_tokens": 4617515.0,
|
||
|
|
"step": 2505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2111225605010985,
|
||
|
|
"epoch": 0.21088006721277042,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004998185064766391,
|
||
|
|
"loss": 5.9892,
|
||
|
|
"mean_token_accuracy": 0.135587390512228,
|
||
|
|
"num_tokens": 4627037.0,
|
||
|
|
"step": 2510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.083059787750244,
|
||
|
|
"epoch": 0.21130014702793531,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004998173019067127,
|
||
|
|
"loss": 5.9864,
|
||
|
|
"mean_token_accuracy": 0.13536423593759536,
|
||
|
|
"num_tokens": 4637393.0,
|
||
|
|
"step": 2515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.111885261535645,
|
||
|
|
"epoch": 0.21172022684310018,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004998160933542633,
|
||
|
|
"loss": 6.0252,
|
||
|
|
"mean_token_accuracy": 0.12426691725850106,
|
||
|
|
"num_tokens": 4646832.0,
|
||
|
|
"step": 2520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.200415229797363,
|
||
|
|
"epoch": 0.21214030665826508,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004998148808193128,
|
||
|
|
"loss": 6.0364,
|
||
|
|
"mean_token_accuracy": 0.1378290109336376,
|
||
|
|
"num_tokens": 4655719.0,
|
||
|
|
"step": 2525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.140298128128052,
|
||
|
|
"epoch": 0.21256038647342995,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004998136643018823,
|
||
|
|
"loss": 5.9978,
|
||
|
|
"mean_token_accuracy": 0.1409161224961281,
|
||
|
|
"num_tokens": 4665364.0,
|
||
|
|
"step": 2530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.113859462738037,
|
||
|
|
"epoch": 0.21298046628859482,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004998124438019935,
|
||
|
|
"loss": 5.9707,
|
||
|
|
"mean_token_accuracy": 0.13255369514226914,
|
||
|
|
"num_tokens": 4674760.0,
|
||
|
|
"step": 2535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.032169342041016,
|
||
|
|
"epoch": 0.21340054610375972,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004998112193196681,
|
||
|
|
"loss": 5.8954,
|
||
|
|
"mean_token_accuracy": 0.1398087151348591,
|
||
|
|
"num_tokens": 4683900.0,
|
||
|
|
"step": 2540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.009505701065064,
|
||
|
|
"epoch": 0.2138206259189246,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004998099908549277,
|
||
|
|
"loss": 5.9487,
|
||
|
|
"mean_token_accuracy": 0.1326383799314499,
|
||
|
|
"num_tokens": 4693915.0,
|
||
|
|
"step": 2545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.048102998733521,
|
||
|
|
"epoch": 0.2142407057340895,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.000499808758407794,
|
||
|
|
"loss": 5.7948,
|
||
|
|
"mean_token_accuracy": 0.1494914174079895,
|
||
|
|
"num_tokens": 4703102.0,
|
||
|
|
"step": 2550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.130202150344848,
|
||
|
|
"epoch": 0.21466078554925436,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004998075219782889,
|
||
|
|
"loss": 6.0201,
|
||
|
|
"mean_token_accuracy": 0.13604088351130486,
|
||
|
|
"num_tokens": 4712925.0,
|
||
|
|
"step": 2555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.086578845977783,
|
||
|
|
"epoch": 0.21508086536441923,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004998062815664344,
|
||
|
|
"loss": 5.9508,
|
||
|
|
"mean_token_accuracy": 0.13391971811652184,
|
||
|
|
"num_tokens": 4722641.0,
|
||
|
|
"step": 2560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.060202693939209,
|
||
|
|
"epoch": 0.21550094517958412,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004998050371722524,
|
||
|
|
"loss": 6.028,
|
||
|
|
"mean_token_accuracy": 0.13827937468886375,
|
||
|
|
"num_tokens": 4732603.0,
|
||
|
|
"step": 2565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.060051965713501,
|
||
|
|
"epoch": 0.215921024994749,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004998037887957649,
|
||
|
|
"loss": 5.8655,
|
||
|
|
"mean_token_accuracy": 0.1426350235939026,
|
||
|
|
"num_tokens": 4742644.0,
|
||
|
|
"step": 2570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.2458967685699465,
|
||
|
|
"epoch": 0.2163411048099139,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004998025364369939,
|
||
|
|
"loss": 6.1759,
|
||
|
|
"mean_token_accuracy": 0.1332129217684269,
|
||
|
|
"num_tokens": 4751482.0,
|
||
|
|
"step": 2575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.246464967727661,
|
||
|
|
"epoch": 0.21676118462507876,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004998012800959619,
|
||
|
|
"loss": 6.0435,
|
||
|
|
"mean_token_accuracy": 0.13494925051927567,
|
||
|
|
"num_tokens": 4760593.0,
|
||
|
|
"step": 2580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.139482402801514,
|
||
|
|
"epoch": 0.21718126444024366,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004998000197726909,
|
||
|
|
"loss": 6.041,
|
||
|
|
"mean_token_accuracy": 0.14071242287755012,
|
||
|
|
"num_tokens": 4769294.0,
|
||
|
|
"step": 2585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.151182079315186,
|
||
|
|
"epoch": 0.21760134425540853,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004997987554672033,
|
||
|
|
"loss": 5.9433,
|
||
|
|
"mean_token_accuracy": 0.13458855599164962,
|
||
|
|
"num_tokens": 4779239.0,
|
||
|
|
"step": 2590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.153560495376587,
|
||
|
|
"epoch": 0.2180214240705734,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004997974871795215,
|
||
|
|
"loss": 6.0165,
|
||
|
|
"mean_token_accuracy": 0.13904761373996735,
|
||
|
|
"num_tokens": 4788211.0,
|
||
|
|
"step": 2595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1266923427581785,
|
||
|
|
"epoch": 0.2184415038857383,
|
||
|
|
"grad_norm": 0.87109375,
|
||
|
|
"learning_rate": 0.000499796214909668,
|
||
|
|
"loss": 5.9707,
|
||
|
|
"mean_token_accuracy": 0.14307306259870528,
|
||
|
|
"num_tokens": 4797921.0,
|
||
|
|
"step": 2600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.151721715927124,
|
||
|
|
"epoch": 0.21886158370090317,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004997949386576653,
|
||
|
|
"loss": 5.9792,
|
||
|
|
"mean_token_accuracy": 0.1372672997415066,
|
||
|
|
"num_tokens": 4807772.0,
|
||
|
|
"step": 2605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.999966764450074,
|
||
|
|
"epoch": 0.21928166351606806,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.000499793658423536,
|
||
|
|
"loss": 6.0037,
|
||
|
|
"mean_token_accuracy": 0.13394766226410865,
|
||
|
|
"num_tokens": 4817999.0,
|
||
|
|
"step": 2610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.197027158737183,
|
||
|
|
"epoch": 0.21970174333123293,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004997923742073028,
|
||
|
|
"loss": 5.9552,
|
||
|
|
"mean_token_accuracy": 0.14477612674236298,
|
||
|
|
"num_tokens": 4826679.0,
|
||
|
|
"step": 2615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0403674125671385,
|
||
|
|
"epoch": 0.2201218231463978,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004997910860089884,
|
||
|
|
"loss": 5.9647,
|
||
|
|
"mean_token_accuracy": 0.13903913348913194,
|
||
|
|
"num_tokens": 4834998.0,
|
||
|
|
"step": 2620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.119702100753784,
|
||
|
|
"epoch": 0.2205419029615627,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004997897938286156,
|
||
|
|
"loss": 5.9173,
|
||
|
|
"mean_token_accuracy": 0.13934070989489555,
|
||
|
|
"num_tokens": 4843635.0,
|
||
|
|
"step": 2625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.135205316543579,
|
||
|
|
"epoch": 0.22096198277672757,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004997884976662075,
|
||
|
|
"loss": 6.0334,
|
||
|
|
"mean_token_accuracy": 0.13847846239805223,
|
||
|
|
"num_tokens": 4852027.0,
|
||
|
|
"step": 2630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.115947484970093,
|
||
|
|
"epoch": 0.22138206259189247,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004997871975217868,
|
||
|
|
"loss": 5.9555,
|
||
|
|
"mean_token_accuracy": 0.1428781971335411,
|
||
|
|
"num_tokens": 4861244.0,
|
||
|
|
"step": 2635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.043252468109131,
|
||
|
|
"epoch": 0.22180214240705734,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004997858933953768,
|
||
|
|
"loss": 5.8579,
|
||
|
|
"mean_token_accuracy": 0.14281281381845473,
|
||
|
|
"num_tokens": 4869902.0,
|
||
|
|
"step": 2640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.012739181518555,
|
||
|
|
"epoch": 0.2222222222222222,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004997845852870004,
|
||
|
|
"loss": 5.8421,
|
||
|
|
"mean_token_accuracy": 0.1463296964764595,
|
||
|
|
"num_tokens": 4878502.0,
|
||
|
|
"step": 2645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.089871215820312,
|
||
|
|
"epoch": 0.2226423020373871,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004997832731966806,
|
||
|
|
"loss": 5.9032,
|
||
|
|
"mean_token_accuracy": 0.14714645445346833,
|
||
|
|
"num_tokens": 4888348.0,
|
||
|
|
"step": 2650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.06225700378418,
|
||
|
|
"epoch": 0.22306238185255198,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004997819571244411,
|
||
|
|
"loss": 5.972,
|
||
|
|
"mean_token_accuracy": 0.1450254276394844,
|
||
|
|
"num_tokens": 4897302.0,
|
||
|
|
"step": 2655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0446860790252686,
|
||
|
|
"epoch": 0.22348246166771688,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004997806370703049,
|
||
|
|
"loss": 5.9876,
|
||
|
|
"mean_token_accuracy": 0.14430617392063141,
|
||
|
|
"num_tokens": 4907078.0,
|
||
|
|
"step": 2660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.057806348800659,
|
||
|
|
"epoch": 0.22390254148288175,
|
||
|
|
"grad_norm": 0.8671875,
|
||
|
|
"learning_rate": 0.0004997793130342954,
|
||
|
|
"loss": 5.8272,
|
||
|
|
"mean_token_accuracy": 0.1456086441874504,
|
||
|
|
"num_tokens": 4917489.0,
|
||
|
|
"step": 2665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.973814630508423,
|
||
|
|
"epoch": 0.22432262129804661,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004997779850164363,
|
||
|
|
"loss": 5.9156,
|
||
|
|
"mean_token_accuracy": 0.140571466088295,
|
||
|
|
"num_tokens": 4927073.0,
|
||
|
|
"step": 2670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.177860355377197,
|
||
|
|
"epoch": 0.2247427011132115,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004997766530167508,
|
||
|
|
"loss": 6.019,
|
||
|
|
"mean_token_accuracy": 0.1344543881714344,
|
||
|
|
"num_tokens": 4935464.0,
|
||
|
|
"step": 2675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.22092981338501,
|
||
|
|
"epoch": 0.22516278092837638,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004997753170352627,
|
||
|
|
"loss": 6.0914,
|
||
|
|
"mean_token_accuracy": 0.13605839386582375,
|
||
|
|
"num_tokens": 4944718.0,
|
||
|
|
"step": 2680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.105925226211548,
|
||
|
|
"epoch": 0.22558286074354128,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004997739770719955,
|
||
|
|
"loss": 5.9844,
|
||
|
|
"mean_token_accuracy": 0.13587288782000542,
|
||
|
|
"num_tokens": 4954223.0,
|
||
|
|
"step": 2685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.107930469512939,
|
||
|
|
"epoch": 0.22600294055870615,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.000499772633126973,
|
||
|
|
"loss": 6.0132,
|
||
|
|
"mean_token_accuracy": 0.13594387769699096,
|
||
|
|
"num_tokens": 4963371.0,
|
||
|
|
"step": 2690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.04271125793457,
|
||
|
|
"epoch": 0.22642302037387105,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004997712852002192,
|
||
|
|
"loss": 5.8679,
|
||
|
|
"mean_token_accuracy": 0.1471228800714016,
|
||
|
|
"num_tokens": 4972973.0,
|
||
|
|
"step": 2695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.086397647857666,
|
||
|
|
"epoch": 0.22684310018903592,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004997699332917578,
|
||
|
|
"loss": 6.1119,
|
||
|
|
"mean_token_accuracy": 0.12916670590639115,
|
||
|
|
"num_tokens": 4982808.0,
|
||
|
|
"step": 2700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.201492786407471,
|
||
|
|
"epoch": 0.2272631800042008,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004997685774016127,
|
||
|
|
"loss": 5.9896,
|
||
|
|
"mean_token_accuracy": 0.13685485795140268,
|
||
|
|
"num_tokens": 4992427.0,
|
||
|
|
"step": 2705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.162964010238648,
|
||
|
|
"epoch": 0.22768325981936569,
|
||
|
|
"grad_norm": 0.84375,
|
||
|
|
"learning_rate": 0.000499767217529808,
|
||
|
|
"loss": 6.1604,
|
||
|
|
"mean_token_accuracy": 0.12921097874641418,
|
||
|
|
"num_tokens": 5003562.0,
|
||
|
|
"step": 2710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.098525857925415,
|
||
|
|
"epoch": 0.22810333963453056,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004997658536763678,
|
||
|
|
"loss": 5.8638,
|
||
|
|
"mean_token_accuracy": 0.1451013281941414,
|
||
|
|
"num_tokens": 5013429.0,
|
||
|
|
"step": 2715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.117339611053467,
|
||
|
|
"epoch": 0.22852341944969545,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004997644858413163,
|
||
|
|
"loss": 6.0022,
|
||
|
|
"mean_token_accuracy": 0.14247513711452484,
|
||
|
|
"num_tokens": 5022045.0,
|
||
|
|
"step": 2720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.008642053604126,
|
||
|
|
"epoch": 0.22894349926486032,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004997631140246775,
|
||
|
|
"loss": 5.8287,
|
||
|
|
"mean_token_accuracy": 0.14408515840768815,
|
||
|
|
"num_tokens": 5032260.0,
|
||
|
|
"step": 2725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.021863174438477,
|
||
|
|
"epoch": 0.2293635790800252,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000499761738226476,
|
||
|
|
"loss": 5.8626,
|
||
|
|
"mean_token_accuracy": 0.14258013665676117,
|
||
|
|
"num_tokens": 5041688.0,
|
||
|
|
"step": 2730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.056025457382202,
|
||
|
|
"epoch": 0.2297836588951901,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.000499760358446736,
|
||
|
|
"loss": 5.9702,
|
||
|
|
"mean_token_accuracy": 0.13718490228056907,
|
||
|
|
"num_tokens": 5051005.0,
|
||
|
|
"step": 2735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.152891635894775,
|
||
|
|
"epoch": 0.23020373871035496,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000499758974685482,
|
||
|
|
"loss": 5.9147,
|
||
|
|
"mean_token_accuracy": 0.13967233374714852,
|
||
|
|
"num_tokens": 5060084.0,
|
||
|
|
"step": 2740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059838390350341,
|
||
|
|
"epoch": 0.23062381852551986,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004997575869427385,
|
||
|
|
"loss": 5.9122,
|
||
|
|
"mean_token_accuracy": 0.14734914749860764,
|
||
|
|
"num_tokens": 5069081.0,
|
||
|
|
"step": 2745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0928624153137205,
|
||
|
|
"epoch": 0.23104389834068473,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.00049975619521853,
|
||
|
|
"loss": 5.9121,
|
||
|
|
"mean_token_accuracy": 0.13845374211668968,
|
||
|
|
"num_tokens": 5078597.0,
|
||
|
|
"step": 2750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.052087306976318,
|
||
|
|
"epoch": 0.2314639781558496,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004997547995128814,
|
||
|
|
"loss": 5.9554,
|
||
|
|
"mean_token_accuracy": 0.14530446976423264,
|
||
|
|
"num_tokens": 5087607.0,
|
||
|
|
"step": 2755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.094136476516724,
|
||
|
|
"epoch": 0.2318840579710145,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004997533998258171,
|
||
|
|
"loss": 5.9424,
|
||
|
|
"mean_token_accuracy": 0.14329736083745956,
|
||
|
|
"num_tokens": 5097412.0,
|
||
|
|
"step": 2760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.16567211151123,
|
||
|
|
"epoch": 0.23230413778617937,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004997519961573622,
|
||
|
|
"loss": 6.0152,
|
||
|
|
"mean_token_accuracy": 0.13348544016480446,
|
||
|
|
"num_tokens": 5105817.0,
|
||
|
|
"step": 2765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.226717376708985,
|
||
|
|
"epoch": 0.23272421760134426,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004997505885075414,
|
||
|
|
"loss": 6.0522,
|
||
|
|
"mean_token_accuracy": 0.13480133637785913,
|
||
|
|
"num_tokens": 5114958.0,
|
||
|
|
"step": 2770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.084324312210083,
|
||
|
|
"epoch": 0.23314429741650913,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004997491768763795,
|
||
|
|
"loss": 5.9898,
|
||
|
|
"mean_token_accuracy": 0.13868246227502823,
|
||
|
|
"num_tokens": 5123728.0,
|
||
|
|
"step": 2775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.100927209854126,
|
||
|
|
"epoch": 0.23356437723167403,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004997477612639018,
|
||
|
|
"loss": 6.0218,
|
||
|
|
"mean_token_accuracy": 0.13395264372229576,
|
||
|
|
"num_tokens": 5134099.0,
|
||
|
|
"step": 2780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.162116241455078,
|
||
|
|
"epoch": 0.2339844570468389,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004997463416701332,
|
||
|
|
"loss": 6.0325,
|
||
|
|
"mean_token_accuracy": 0.13172747194766998,
|
||
|
|
"num_tokens": 5142934.0,
|
||
|
|
"step": 2785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.000607919692993,
|
||
|
|
"epoch": 0.23440453686200377,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004997449180950989,
|
||
|
|
"loss": 5.8681,
|
||
|
|
"mean_token_accuracy": 0.15649961084127426,
|
||
|
|
"num_tokens": 5151835.0,
|
||
|
|
"step": 2790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.038245487213135,
|
||
|
|
"epoch": 0.23482461667716867,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004997434905388241,
|
||
|
|
"loss": 5.921,
|
||
|
|
"mean_token_accuracy": 0.1477814018726349,
|
||
|
|
"num_tokens": 5161136.0,
|
||
|
|
"step": 2795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.029763174057007,
|
||
|
|
"epoch": 0.23524469649233354,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.000499742059001334,
|
||
|
|
"loss": 5.8684,
|
||
|
|
"mean_token_accuracy": 0.14450337663292884,
|
||
|
|
"num_tokens": 5170741.0,
|
||
|
|
"step": 2800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.046102046966553,
|
||
|
|
"epoch": 0.23566477630749844,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004997406234826541,
|
||
|
|
"loss": 5.9001,
|
||
|
|
"mean_token_accuracy": 0.14729267880320548,
|
||
|
|
"num_tokens": 5180549.0,
|
||
|
|
"step": 2805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.980107164382934,
|
||
|
|
"epoch": 0.2360848561226633,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004997391839828098,
|
||
|
|
"loss": 5.8667,
|
||
|
|
"mean_token_accuracy": 0.14962306916713713,
|
||
|
|
"num_tokens": 5189486.0,
|
||
|
|
"step": 2810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.044159746170044,
|
||
|
|
"epoch": 0.23650493593782818,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004997377405018266,
|
||
|
|
"loss": 5.9303,
|
||
|
|
"mean_token_accuracy": 0.13750530928373336,
|
||
|
|
"num_tokens": 5198525.0,
|
||
|
|
"step": 2815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.075648498535156,
|
||
|
|
"epoch": 0.23692501575299307,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00049973629303973,
|
||
|
|
"loss": 5.9734,
|
||
|
|
"mean_token_accuracy": 0.14086321070790292,
|
||
|
|
"num_tokens": 5207124.0,
|
||
|
|
"step": 2820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.964286422729492,
|
||
|
|
"epoch": 0.23734509556815794,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004997348415965457,
|
||
|
|
"loss": 5.8079,
|
||
|
|
"mean_token_accuracy": 0.14603810012340546,
|
||
|
|
"num_tokens": 5216529.0,
|
||
|
|
"step": 2825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.12622709274292,
|
||
|
|
"epoch": 0.23776517538332284,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004997333861722995,
|
||
|
|
"loss": 5.9402,
|
||
|
|
"mean_token_accuracy": 0.14331007972359658,
|
||
|
|
"num_tokens": 5225796.0,
|
||
|
|
"step": 2830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.085462188720703,
|
||
|
|
"epoch": 0.2381852551984877,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000499731926767017,
|
||
|
|
"loss": 5.9732,
|
||
|
|
"mean_token_accuracy": 0.14003979936242103,
|
||
|
|
"num_tokens": 5233876.0,
|
||
|
|
"step": 2835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.016348743438721,
|
||
|
|
"epoch": 0.23860533501365258,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004997304633807242,
|
||
|
|
"loss": 5.9695,
|
||
|
|
"mean_token_accuracy": 0.13823127001523972,
|
||
|
|
"num_tokens": 5244782.0,
|
||
|
|
"step": 2840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.077929925918579,
|
||
|
|
"epoch": 0.23902541482881748,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004997289960134468,
|
||
|
|
"loss": 5.8993,
|
||
|
|
"mean_token_accuracy": 0.14192162305116654,
|
||
|
|
"num_tokens": 5253453.0,
|
||
|
|
"step": 2845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.049857330322266,
|
||
|
|
"epoch": 0.23944549464398235,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004997275246652111,
|
||
|
|
"loss": 5.9414,
|
||
|
|
"mean_token_accuracy": 0.14183279648423194,
|
||
|
|
"num_tokens": 5262355.0,
|
||
|
|
"step": 2850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.019342088699341,
|
||
|
|
"epoch": 0.23986557445914725,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.000499726049336043,
|
||
|
|
"loss": 5.8652,
|
||
|
|
"mean_token_accuracy": 0.14227822795510292,
|
||
|
|
"num_tokens": 5271959.0,
|
||
|
|
"step": 2855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.045290803909301,
|
||
|
|
"epoch": 0.24028565427431212,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004997245700259686,
|
||
|
|
"loss": 5.8938,
|
||
|
|
"mean_token_accuracy": 0.14394148513674737,
|
||
|
|
"num_tokens": 5281393.0,
|
||
|
|
"step": 2860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.126777935028076,
|
||
|
|
"epoch": 0.240705734089477,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004997230867350141,
|
||
|
|
"loss": 6.0153,
|
||
|
|
"mean_token_accuracy": 0.13795892894268036,
|
||
|
|
"num_tokens": 5290979.0,
|
||
|
|
"step": 2865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.170654964447022,
|
||
|
|
"epoch": 0.24112581390464188,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004997215994632059,
|
||
|
|
"loss": 5.9662,
|
||
|
|
"mean_token_accuracy": 0.1420626498758793,
|
||
|
|
"num_tokens": 5300263.0,
|
||
|
|
"step": 2870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.098070096969605,
|
||
|
|
"epoch": 0.24154589371980675,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004997201082105704,
|
||
|
|
"loss": 5.9973,
|
||
|
|
"mean_token_accuracy": 0.1376795694231987,
|
||
|
|
"num_tokens": 5309522.0,
|
||
|
|
"step": 2875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.09854941368103,
|
||
|
|
"epoch": 0.24196597353497165,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004997186129771338,
|
||
|
|
"loss": 5.9906,
|
||
|
|
"mean_token_accuracy": 0.1443823680281639,
|
||
|
|
"num_tokens": 5319770.0,
|
||
|
|
"step": 2880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.159392309188843,
|
||
|
|
"epoch": 0.24238605335013652,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004997171137629226,
|
||
|
|
"loss": 5.9994,
|
||
|
|
"mean_token_accuracy": 0.14119460731744765,
|
||
|
|
"num_tokens": 5328400.0,
|
||
|
|
"step": 2885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.00137939453125,
|
||
|
|
"epoch": 0.24280613316530142,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004997156105679636,
|
||
|
|
"loss": 5.8054,
|
||
|
|
"mean_token_accuracy": 0.15445883423089982,
|
||
|
|
"num_tokens": 5336338.0,
|
||
|
|
"step": 2890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9904273509979244,
|
||
|
|
"epoch": 0.2432262129804663,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004997141033922832,
|
||
|
|
"loss": 5.8983,
|
||
|
|
"mean_token_accuracy": 0.1381608746945858,
|
||
|
|
"num_tokens": 5345391.0,
|
||
|
|
"step": 2895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.080091238021851,
|
||
|
|
"epoch": 0.24364629279563116,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004997125922359081,
|
||
|
|
"loss": 5.9345,
|
||
|
|
"mean_token_accuracy": 0.13472433462738992,
|
||
|
|
"num_tokens": 5354709.0,
|
||
|
|
"step": 2900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0483152866363525,
|
||
|
|
"epoch": 0.24406637261079606,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004997110770988652,
|
||
|
|
"loss": 5.8441,
|
||
|
|
"mean_token_accuracy": 0.14647466093301773,
|
||
|
|
"num_tokens": 5363738.0,
|
||
|
|
"step": 2905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.065390634536743,
|
||
|
|
"epoch": 0.24448645242596093,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004997095579811813,
|
||
|
|
"loss": 5.9742,
|
||
|
|
"mean_token_accuracy": 0.14132302552461623,
|
||
|
|
"num_tokens": 5373583.0,
|
||
|
|
"step": 2910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1408384323120115,
|
||
|
|
"epoch": 0.24490653224112582,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004997080348828833,
|
||
|
|
"loss": 6.0104,
|
||
|
|
"mean_token_accuracy": 0.14406906738877295,
|
||
|
|
"num_tokens": 5383486.0,
|
||
|
|
"step": 2915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.012083101272583,
|
||
|
|
"epoch": 0.2453266120562907,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004997065078039981,
|
||
|
|
"loss": 5.9283,
|
||
|
|
"mean_token_accuracy": 0.13883504942059516,
|
||
|
|
"num_tokens": 5391974.0,
|
||
|
|
"step": 2920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.098450088500977,
|
||
|
|
"epoch": 0.24574669187145556,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004997049767445529,
|
||
|
|
"loss": 5.9688,
|
||
|
|
"mean_token_accuracy": 0.13587900176644324,
|
||
|
|
"num_tokens": 5400882.0,
|
||
|
|
"step": 2925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1687455654144285,
|
||
|
|
"epoch": 0.24616677168662046,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004997034417045746,
|
||
|
|
"loss": 5.9199,
|
||
|
|
"mean_token_accuracy": 0.13755179792642594,
|
||
|
|
"num_tokens": 5410538.0,
|
||
|
|
"step": 2930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.019326400756836,
|
||
|
|
"epoch": 0.24658685150178533,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004997019026840907,
|
||
|
|
"loss": 5.8134,
|
||
|
|
"mean_token_accuracy": 0.14420632421970367,
|
||
|
|
"num_tokens": 5419406.0,
|
||
|
|
"step": 2935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9686970710754395,
|
||
|
|
"epoch": 0.24700693131695023,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004997003596831282,
|
||
|
|
"loss": 5.941,
|
||
|
|
"mean_token_accuracy": 0.13971618413925171,
|
||
|
|
"num_tokens": 5428817.0,
|
||
|
|
"step": 2940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.097631120681763,
|
||
|
|
"epoch": 0.2474270111321151,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996988127017145,
|
||
|
|
"loss": 5.9448,
|
||
|
|
"mean_token_accuracy": 0.13872243240475654,
|
||
|
|
"num_tokens": 5438277.0,
|
||
|
|
"step": 2945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.047083616256714,
|
||
|
|
"epoch": 0.24784709094728,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004996972617398772,
|
||
|
|
"loss": 5.974,
|
||
|
|
"mean_token_accuracy": 0.13909853398799896,
|
||
|
|
"num_tokens": 5447440.0,
|
||
|
|
"step": 2950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.065885257720947,
|
||
|
|
"epoch": 0.24826717076244487,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996957067976435,
|
||
|
|
"loss": 5.9005,
|
||
|
|
"mean_token_accuracy": 0.13819090723991395,
|
||
|
|
"num_tokens": 5455988.0,
|
||
|
|
"step": 2955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.079396390914917,
|
||
|
|
"epoch": 0.24868725057760974,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004996941478750411,
|
||
|
|
"loss": 5.895,
|
||
|
|
"mean_token_accuracy": 0.14170320481061935,
|
||
|
|
"num_tokens": 5464996.0,
|
||
|
|
"step": 2960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.131442737579346,
|
||
|
|
"epoch": 0.24910733039277463,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004996925849720975,
|
||
|
|
"loss": 6.0433,
|
||
|
|
"mean_token_accuracy": 0.13297844752669336,
|
||
|
|
"num_tokens": 5474174.0,
|
||
|
|
"step": 2965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.144496154785156,
|
||
|
|
"epoch": 0.2495274102079395,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004996910180888405,
|
||
|
|
"loss": 5.928,
|
||
|
|
"mean_token_accuracy": 0.14379495605826378,
|
||
|
|
"num_tokens": 5482838.0,
|
||
|
|
"step": 2970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.089239263534546,
|
||
|
|
"epoch": 0.2499474900231044,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004996894472252977,
|
||
|
|
"loss": 5.9339,
|
||
|
|
"mean_token_accuracy": 0.1420593172311783,
|
||
|
|
"num_tokens": 5491616.0,
|
||
|
|
"step": 2975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.992457008361816,
|
||
|
|
"epoch": 0.25036756983826924,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004996878723814973,
|
||
|
|
"loss": 5.9265,
|
||
|
|
"mean_token_accuracy": 0.13892921283841134,
|
||
|
|
"num_tokens": 5500942.0,
|
||
|
|
"step": 2980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.117427587509155,
|
||
|
|
"epoch": 0.25078764965343414,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004996862935574667,
|
||
|
|
"loss": 5.8788,
|
||
|
|
"mean_token_accuracy": 0.13912170454859735,
|
||
|
|
"num_tokens": 5510078.0,
|
||
|
|
"step": 2985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.943054437637329,
|
||
|
|
"epoch": 0.25120772946859904,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004996847107532342,
|
||
|
|
"loss": 5.9134,
|
||
|
|
"mean_token_accuracy": 0.14340257570147513,
|
||
|
|
"num_tokens": 5518924.0,
|
||
|
|
"step": 2990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.108536148071289,
|
||
|
|
"epoch": 0.25162780928376394,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004996831239688277,
|
||
|
|
"loss": 5.9216,
|
||
|
|
"mean_token_accuracy": 0.13749035373330115,
|
||
|
|
"num_tokens": 5527385.0,
|
||
|
|
"step": 2995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.977105903625488,
|
||
|
|
"epoch": 0.2520478890989288,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004996815332042754,
|
||
|
|
"loss": 5.766,
|
||
|
|
"mean_token_accuracy": 0.15047305673360825,
|
||
|
|
"num_tokens": 5536781.0,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2520478890989288,
|
||
|
|
"eval_entropy": 5.7445289912557636,
|
||
|
|
"eval_loss": 5.931798458099365,
|
||
|
|
"eval_mean_token_accuracy": 0.1480788363722414,
|
||
|
|
"eval_num_tokens": 5536781.0,
|
||
|
|
"eval_runtime": 21.0325,
|
||
|
|
"eval_samples_per_second": 1776.586,
|
||
|
|
"eval_steps_per_second": 222.085,
|
||
|
|
"step": 3000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.008361387252807,
|
||
|
|
"epoch": 0.2524679689140937,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004996799384596054,
|
||
|
|
"loss": 5.9477,
|
||
|
|
"mean_token_accuracy": 0.14386533573269844,
|
||
|
|
"num_tokens": 5545893.0,
|
||
|
|
"step": 3005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.112303066253662,
|
||
|
|
"epoch": 0.2528880487292586,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004996783397348461,
|
||
|
|
"loss": 5.9152,
|
||
|
|
"mean_token_accuracy": 0.13690555915236474,
|
||
|
|
"num_tokens": 5555818.0,
|
||
|
|
"step": 3010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.042035245895386,
|
||
|
|
"epoch": 0.2533081285444234,
|
||
|
|
"grad_norm": 0.8671875,
|
||
|
|
"learning_rate": 0.0004996767370300256,
|
||
|
|
"loss": 5.8717,
|
||
|
|
"mean_token_accuracy": 0.14453656524419783,
|
||
|
|
"num_tokens": 5565331.0,
|
||
|
|
"step": 3015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.081929445266724,
|
||
|
|
"epoch": 0.2537282083595883,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004996751303451724,
|
||
|
|
"loss": 5.8599,
|
||
|
|
"mean_token_accuracy": 0.14481035768985748,
|
||
|
|
"num_tokens": 5574003.0,
|
||
|
|
"step": 3020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.977067756652832,
|
||
|
|
"epoch": 0.2541482881747532,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004996735196803149,
|
||
|
|
"loss": 5.7815,
|
||
|
|
"mean_token_accuracy": 0.15307400673627852,
|
||
|
|
"num_tokens": 5582517.0,
|
||
|
|
"step": 3025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.072621822357178,
|
||
|
|
"epoch": 0.2545683679899181,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004996719050354818,
|
||
|
|
"loss": 5.9948,
|
||
|
|
"mean_token_accuracy": 0.13989571258425712,
|
||
|
|
"num_tokens": 5591952.0,
|
||
|
|
"step": 3030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.03379979133606,
|
||
|
|
"epoch": 0.25498844780508295,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004996702864107015,
|
||
|
|
"loss": 5.8913,
|
||
|
|
"mean_token_accuracy": 0.14787303507328034,
|
||
|
|
"num_tokens": 5601460.0,
|
||
|
|
"step": 3035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.189465713500977,
|
||
|
|
"epoch": 0.25540852762024785,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996686638060028,
|
||
|
|
"loss": 6.0052,
|
||
|
|
"mean_token_accuracy": 0.13520606160163878,
|
||
|
|
"num_tokens": 5610776.0,
|
||
|
|
"step": 3040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.085352611541748,
|
||
|
|
"epoch": 0.25582860743541275,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004996670372214144,
|
||
|
|
"loss": 5.9054,
|
||
|
|
"mean_token_accuracy": 0.14562050476670266,
|
||
|
|
"num_tokens": 5619627.0,
|
||
|
|
"step": 3045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9095056533813475,
|
||
|
|
"epoch": 0.2562486872505776,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004996654066569651,
|
||
|
|
"loss": 5.7872,
|
||
|
|
"mean_token_accuracy": 0.14956104382872581,
|
||
|
|
"num_tokens": 5628969.0,
|
||
|
|
"step": 3050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.998289918899536,
|
||
|
|
"epoch": 0.2566687670657425,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004996637721126839,
|
||
|
|
"loss": 5.8501,
|
||
|
|
"mean_token_accuracy": 0.14419863522052764,
|
||
|
|
"num_tokens": 5638629.0,
|
||
|
|
"step": 3055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.084632110595703,
|
||
|
|
"epoch": 0.2570888468809074,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004996621335885996,
|
||
|
|
"loss": 5.9249,
|
||
|
|
"mean_token_accuracy": 0.13865133970975876,
|
||
|
|
"num_tokens": 5647571.0,
|
||
|
|
"step": 3060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.059264850616455,
|
||
|
|
"epoch": 0.2575089266960722,
|
||
|
|
"grad_norm": 1.21875,
|
||
|
|
"learning_rate": 0.0004996604910847413,
|
||
|
|
"loss": 5.8418,
|
||
|
|
"mean_token_accuracy": 0.1548224687576294,
|
||
|
|
"num_tokens": 5656709.0,
|
||
|
|
"step": 3065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.037788724899292,
|
||
|
|
"epoch": 0.2579290065112371,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000499658844601138,
|
||
|
|
"loss": 6.0136,
|
||
|
|
"mean_token_accuracy": 0.14061269238591195,
|
||
|
|
"num_tokens": 5665714.0,
|
||
|
|
"step": 3070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.112887382507324,
|
||
|
|
"epoch": 0.258349086326402,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.000499657194137819,
|
||
|
|
"loss": 5.9813,
|
||
|
|
"mean_token_accuracy": 0.1434816040098667,
|
||
|
|
"num_tokens": 5675854.0,
|
||
|
|
"step": 3075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.10079174041748,
|
||
|
|
"epoch": 0.2587691661415669,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004996555396948136,
|
||
|
|
"loss": 5.8062,
|
||
|
|
"mean_token_accuracy": 0.14445895925164223,
|
||
|
|
"num_tokens": 5685690.0,
|
||
|
|
"step": 3080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.008033037185669,
|
||
|
|
"epoch": 0.25918924595673176,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004996538812721509,
|
||
|
|
"loss": 5.8654,
|
||
|
|
"mean_token_accuracy": 0.14993129372596742,
|
||
|
|
"num_tokens": 5695766.0,
|
||
|
|
"step": 3085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.072084999084472,
|
||
|
|
"epoch": 0.25960932577189666,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004996522188698603,
|
||
|
|
"loss": 5.8982,
|
||
|
|
"mean_token_accuracy": 0.14610292240977288,
|
||
|
|
"num_tokens": 5704365.0,
|
||
|
|
"step": 3090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0555907726287845,
|
||
|
|
"epoch": 0.26002940558706156,
|
||
|
|
"grad_norm": 1.125,
|
||
|
|
"learning_rate": 0.0004996505524879714,
|
||
|
|
"loss": 6.0101,
|
||
|
|
"mean_token_accuracy": 0.14055205136537552,
|
||
|
|
"num_tokens": 5713345.0,
|
||
|
|
"step": 3095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.035314083099365,
|
||
|
|
"epoch": 0.2604494854022264,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004996488821265137,
|
||
|
|
"loss": 5.816,
|
||
|
|
"mean_token_accuracy": 0.14724740535020828,
|
||
|
|
"num_tokens": 5722907.0,
|
||
|
|
"step": 3100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.007513093948364,
|
||
|
|
"epoch": 0.2608695652173913,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996472077855166,
|
||
|
|
"loss": 5.8596,
|
||
|
|
"mean_token_accuracy": 0.1498942032456398,
|
||
|
|
"num_tokens": 5731589.0,
|
||
|
|
"step": 3105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.998636054992676,
|
||
|
|
"epoch": 0.2612896450325562,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.00049964552946501,
|
||
|
|
"loss": 5.8476,
|
||
|
|
"mean_token_accuracy": 0.1439466342329979,
|
||
|
|
"num_tokens": 5739922.0,
|
||
|
|
"step": 3110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9389458179473875,
|
||
|
|
"epoch": 0.2617097248477211,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004996438471650235,
|
||
|
|
"loss": 5.7675,
|
||
|
|
"mean_token_accuracy": 0.15062671899795532,
|
||
|
|
"num_tokens": 5749206.0,
|
||
|
|
"step": 3115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.008351278305054,
|
||
|
|
"epoch": 0.26212980466288593,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004996421608855869,
|
||
|
|
"loss": 5.8288,
|
||
|
|
"mean_token_accuracy": 0.15271472856402396,
|
||
|
|
"num_tokens": 5758803.0,
|
||
|
|
"step": 3120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.044885444641113,
|
||
|
|
"epoch": 0.26254988447805083,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004996404706267301,
|
||
|
|
"loss": 5.9065,
|
||
|
|
"mean_token_accuracy": 0.13532925099134446,
|
||
|
|
"num_tokens": 5768368.0,
|
||
|
|
"step": 3125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.958721733093261,
|
||
|
|
"epoch": 0.26296996429321573,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.000499638776388483,
|
||
|
|
"loss": 5.7648,
|
||
|
|
"mean_token_accuracy": 0.1534928262233734,
|
||
|
|
"num_tokens": 5776707.0,
|
||
|
|
"step": 3130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.986162996292114,
|
||
|
|
"epoch": 0.26339004410838057,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004996370781708757,
|
||
|
|
"loss": 5.9532,
|
||
|
|
"mean_token_accuracy": 0.13491747826337813,
|
||
|
|
"num_tokens": 5787037.0,
|
||
|
|
"step": 3135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.018689870834351,
|
||
|
|
"epoch": 0.26381012392354547,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004996353759739382,
|
||
|
|
"loss": 5.9005,
|
||
|
|
"mean_token_accuracy": 0.14967331141233445,
|
||
|
|
"num_tokens": 5796630.0,
|
||
|
|
"step": 3140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.985601377487183,
|
||
|
|
"epoch": 0.26423020373871037,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004996336697977007,
|
||
|
|
"loss": 5.8974,
|
||
|
|
"mean_token_accuracy": 0.14190822690725327,
|
||
|
|
"num_tokens": 5806402.0,
|
||
|
|
"step": 3145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.99180235862732,
|
||
|
|
"epoch": 0.2646502835538752,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996319596421933,
|
||
|
|
"loss": 5.853,
|
||
|
|
"mean_token_accuracy": 0.14679677560925483,
|
||
|
|
"num_tokens": 5815742.0,
|
||
|
|
"step": 3150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.00025954246521,
|
||
|
|
"epoch": 0.2650703633690401,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004996302455074466,
|
||
|
|
"loss": 5.8679,
|
||
|
|
"mean_token_accuracy": 0.14232094436883927,
|
||
|
|
"num_tokens": 5824915.0,
|
||
|
|
"step": 3155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.032740592956543,
|
||
|
|
"epoch": 0.265490443184205,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004996285273934906,
|
||
|
|
"loss": 5.8901,
|
||
|
|
"mean_token_accuracy": 0.14556412398815155,
|
||
|
|
"num_tokens": 5834978.0,
|
||
|
|
"step": 3160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.078465604782105,
|
||
|
|
"epoch": 0.2659105229993699,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.000499626805300356,
|
||
|
|
"loss": 6.0439,
|
||
|
|
"mean_token_accuracy": 0.14277126342058183,
|
||
|
|
"num_tokens": 5845684.0,
|
||
|
|
"step": 3165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.094513893127441,
|
||
|
|
"epoch": 0.26633060281453474,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004996250792280732,
|
||
|
|
"loss": 5.9226,
|
||
|
|
"mean_token_accuracy": 0.13814914003014564,
|
||
|
|
"num_tokens": 5854905.0,
|
||
|
|
"step": 3170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.054658889770508,
|
||
|
|
"epoch": 0.26675068262969964,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004996233491766727,
|
||
|
|
"loss": 5.934,
|
||
|
|
"mean_token_accuracy": 0.14257717728614808,
|
||
|
|
"num_tokens": 5863654.0,
|
||
|
|
"step": 3175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.036546421051026,
|
||
|
|
"epoch": 0.26717076244486454,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004996216151461854,
|
||
|
|
"loss": 5.9289,
|
||
|
|
"mean_token_accuracy": 0.14137156009674073,
|
||
|
|
"num_tokens": 5872442.0,
|
||
|
|
"step": 3180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.089460277557373,
|
||
|
|
"epoch": 0.2675908422600294,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004996198771366417,
|
||
|
|
"loss": 5.8594,
|
||
|
|
"mean_token_accuracy": 0.14687168076634408,
|
||
|
|
"num_tokens": 5882372.0,
|
||
|
|
"step": 3185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.836459922790527,
|
||
|
|
"epoch": 0.2680109220751943,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996181351480726,
|
||
|
|
"loss": 5.6727,
|
||
|
|
"mean_token_accuracy": 0.15421667248010634,
|
||
|
|
"num_tokens": 5891113.0,
|
||
|
|
"step": 3190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.909378480911255,
|
||
|
|
"epoch": 0.2684310018903592,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004996163891805089,
|
||
|
|
"loss": 5.9167,
|
||
|
|
"mean_token_accuracy": 0.14929258525371553,
|
||
|
|
"num_tokens": 5899582.0,
|
||
|
|
"step": 3195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.088847398757935,
|
||
|
|
"epoch": 0.2688510817055241,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004996146392339815,
|
||
|
|
"loss": 5.8788,
|
||
|
|
"mean_token_accuracy": 0.137289460003376,
|
||
|
|
"num_tokens": 5908938.0,
|
||
|
|
"step": 3200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.025485897064209,
|
||
|
|
"epoch": 0.2692711615206889,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004996128853085215,
|
||
|
|
"loss": 5.8462,
|
||
|
|
"mean_token_accuracy": 0.14703118950128555,
|
||
|
|
"num_tokens": 5918055.0,
|
||
|
|
"step": 3205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.024847555160522,
|
||
|
|
"epoch": 0.2696912413358538,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004996111274041598,
|
||
|
|
"loss": 5.8169,
|
||
|
|
"mean_token_accuracy": 0.14159609079360963,
|
||
|
|
"num_tokens": 5926744.0,
|
||
|
|
"step": 3210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.007894611358642,
|
||
|
|
"epoch": 0.2701113211510187,
|
||
|
|
"grad_norm": 0.87109375,
|
||
|
|
"learning_rate": 0.0004996093655209277,
|
||
|
|
"loss": 5.9028,
|
||
|
|
"mean_token_accuracy": 0.1412175938487053,
|
||
|
|
"num_tokens": 5936521.0,
|
||
|
|
"step": 3215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.093644618988037,
|
||
|
|
"epoch": 0.27053140096618356,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004996075996588563,
|
||
|
|
"loss": 5.9689,
|
||
|
|
"mean_token_accuracy": 0.1381188787519932,
|
||
|
|
"num_tokens": 5945010.0,
|
||
|
|
"step": 3220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.014964437484741,
|
||
|
|
"epoch": 0.27095148078134845,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.000499605829817977,
|
||
|
|
"loss": 5.8629,
|
||
|
|
"mean_token_accuracy": 0.15120311975479125,
|
||
|
|
"num_tokens": 5953766.0,
|
||
|
|
"step": 3225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.982144498825074,
|
||
|
|
"epoch": 0.27137156059651335,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.000499604055998321,
|
||
|
|
"loss": 5.8001,
|
||
|
|
"mean_token_accuracy": 0.14623286202549934,
|
||
|
|
"num_tokens": 5962168.0,
|
||
|
|
"step": 3230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.941414022445679,
|
||
|
|
"epoch": 0.2717916404116782,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004996022781999198,
|
||
|
|
"loss": 5.8249,
|
||
|
|
"mean_token_accuracy": 0.14706685170531272,
|
||
|
|
"num_tokens": 5971627.0,
|
||
|
|
"step": 3235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.00689377784729,
|
||
|
|
"epoch": 0.2722117202268431,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.000499600496422805,
|
||
|
|
"loss": 5.8993,
|
||
|
|
"mean_token_accuracy": 0.14405820965766908,
|
||
|
|
"num_tokens": 5981775.0,
|
||
|
|
"step": 3240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.973731327056885,
|
||
|
|
"epoch": 0.272631800042008,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000499598710667008,
|
||
|
|
"loss": 5.838,
|
||
|
|
"mean_token_accuracy": 0.1444271594285965,
|
||
|
|
"num_tokens": 5991097.0,
|
||
|
|
"step": 3245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.973551654815674,
|
||
|
|
"epoch": 0.2730518798571729,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004995969209325604,
|
||
|
|
"loss": 5.8988,
|
||
|
|
"mean_token_accuracy": 0.14417145103216172,
|
||
|
|
"num_tokens": 5999517.0,
|
||
|
|
"step": 3250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.939422225952148,
|
||
|
|
"epoch": 0.2734719596723377,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004995951272194941,
|
||
|
|
"loss": 5.8778,
|
||
|
|
"mean_token_accuracy": 0.139290714263916,
|
||
|
|
"num_tokens": 6008545.0,
|
||
|
|
"step": 3255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.07567138671875,
|
||
|
|
"epoch": 0.2738920394875026,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004995933295278407,
|
||
|
|
"loss": 5.8603,
|
||
|
|
"mean_token_accuracy": 0.14346815124154091,
|
||
|
|
"num_tokens": 6017366.0,
|
||
|
|
"step": 3260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.989615488052368,
|
||
|
|
"epoch": 0.2743121193026675,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004995915278576321,
|
||
|
|
"loss": 5.8024,
|
||
|
|
"mean_token_accuracy": 0.14921536892652512,
|
||
|
|
"num_tokens": 6025597.0,
|
||
|
|
"step": 3265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.995965671539307,
|
||
|
|
"epoch": 0.27473219911783237,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004995897222089004,
|
||
|
|
"loss": 5.9055,
|
||
|
|
"mean_token_accuracy": 0.1438031278550625,
|
||
|
|
"num_tokens": 6034239.0,
|
||
|
|
"step": 3270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.17506217956543,
|
||
|
|
"epoch": 0.27515227893299726,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004995879125816772,
|
||
|
|
"loss": 5.9388,
|
||
|
|
"mean_token_accuracy": 0.14314718097448348,
|
||
|
|
"num_tokens": 6043837.0,
|
||
|
|
"step": 3275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.962472629547119,
|
||
|
|
"epoch": 0.27557235874816216,
|
||
|
|
"grad_norm": 0.87109375,
|
||
|
|
"learning_rate": 0.0004995860989759949,
|
||
|
|
"loss": 5.8709,
|
||
|
|
"mean_token_accuracy": 0.14632273614406585,
|
||
|
|
"num_tokens": 6053217.0,
|
||
|
|
"step": 3280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.029792261123657,
|
||
|
|
"epoch": 0.27599243856332706,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004995842813918855,
|
||
|
|
"loss": 5.8948,
|
||
|
|
"mean_token_accuracy": 0.1460642173886299,
|
||
|
|
"num_tokens": 6061553.0,
|
||
|
|
"step": 3285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.981232643127441,
|
||
|
|
"epoch": 0.2764125183784919,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004995824598293812,
|
||
|
|
"loss": 5.7712,
|
||
|
|
"mean_token_accuracy": 0.1501307800412178,
|
||
|
|
"num_tokens": 6070080.0,
|
||
|
|
"step": 3290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.045267486572266,
|
||
|
|
"epoch": 0.2768325981936568,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004995806342885142,
|
||
|
|
"loss": 5.9245,
|
||
|
|
"mean_token_accuracy": 0.14930349588394165,
|
||
|
|
"num_tokens": 6078438.0,
|
||
|
|
"step": 3295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0462220191955565,
|
||
|
|
"epoch": 0.2772526780088217,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000499578804769317,
|
||
|
|
"loss": 5.9092,
|
||
|
|
"mean_token_accuracy": 0.13776859119534493,
|
||
|
|
"num_tokens": 6087794.0,
|
||
|
|
"step": 3300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.104273176193237,
|
||
|
|
"epoch": 0.27767275782398654,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004995769712718218,
|
||
|
|
"loss": 5.9152,
|
||
|
|
"mean_token_accuracy": 0.14523780345916748,
|
||
|
|
"num_tokens": 6096709.0,
|
||
|
|
"step": 3305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.998883199691773,
|
||
|
|
"epoch": 0.27809283763915144,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004995751337960613,
|
||
|
|
"loss": 5.8495,
|
||
|
|
"mean_token_accuracy": 0.14268894568085672,
|
||
|
|
"num_tokens": 6105866.0,
|
||
|
|
"step": 3310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.001236534118652,
|
||
|
|
"epoch": 0.27851291745431633,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004995732923420679,
|
||
|
|
"loss": 5.8071,
|
||
|
|
"mean_token_accuracy": 0.15081177204847335,
|
||
|
|
"num_tokens": 6114882.0,
|
||
|
|
"step": 3315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.930415248870849,
|
||
|
|
"epoch": 0.2789329972694812,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004995714469098743,
|
||
|
|
"loss": 5.7725,
|
||
|
|
"mean_token_accuracy": 0.14834588766098022,
|
||
|
|
"num_tokens": 6123978.0,
|
||
|
|
"step": 3320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.966728734970093,
|
||
|
|
"epoch": 0.2793530770846461,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000499569597499513,
|
||
|
|
"loss": 5.9104,
|
||
|
|
"mean_token_accuracy": 0.1466206818819046,
|
||
|
|
"num_tokens": 6133246.0,
|
||
|
|
"step": 3325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.988458681106567,
|
||
|
|
"epoch": 0.27977315689981097,
|
||
|
|
"grad_norm": 0.8671875,
|
||
|
|
"learning_rate": 0.0004995677441110172,
|
||
|
|
"loss": 5.7702,
|
||
|
|
"mean_token_accuracy": 0.14939837008714676,
|
||
|
|
"num_tokens": 6142865.0,
|
||
|
|
"step": 3330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.014625930786133,
|
||
|
|
"epoch": 0.28019323671497587,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004995658867444192,
|
||
|
|
"loss": 5.8654,
|
||
|
|
"mean_token_accuracy": 0.13881808668375015,
|
||
|
|
"num_tokens": 6152492.0,
|
||
|
|
"step": 3335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.975307273864746,
|
||
|
|
"epoch": 0.2806133165301407,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004995640253997523,
|
||
|
|
"loss": 5.8652,
|
||
|
|
"mean_token_accuracy": 0.1395415373146534,
|
||
|
|
"num_tokens": 6161953.0,
|
||
|
|
"step": 3340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.848208713531494,
|
||
|
|
"epoch": 0.2810333963453056,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.0004995621600770492,
|
||
|
|
"loss": 5.7285,
|
||
|
|
"mean_token_accuracy": 0.1502986840903759,
|
||
|
|
"num_tokens": 6171467.0,
|
||
|
|
"step": 3345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9759973049163815,
|
||
|
|
"epoch": 0.2814534761604705,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004995602907763431,
|
||
|
|
"loss": 5.8103,
|
||
|
|
"mean_token_accuracy": 0.1470308281481266,
|
||
|
|
"num_tokens": 6180646.0,
|
||
|
|
"step": 3350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.981297445297241,
|
||
|
|
"epoch": 0.28187355597563535,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004995584174976672,
|
||
|
|
"loss": 5.8029,
|
||
|
|
"mean_token_accuracy": 0.14213321059942247,
|
||
|
|
"num_tokens": 6189832.0,
|
||
|
|
"step": 3355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.966393995285034,
|
||
|
|
"epoch": 0.28229363579080025,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004995565402410544,
|
||
|
|
"loss": 5.7274,
|
||
|
|
"mean_token_accuracy": 0.1558822512626648,
|
||
|
|
"num_tokens": 6198339.0,
|
||
|
|
"step": 3360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.935036706924438,
|
||
|
|
"epoch": 0.28271371560596514,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004995546590065383,
|
||
|
|
"loss": 5.8126,
|
||
|
|
"mean_token_accuracy": 0.14656742215156554,
|
||
|
|
"num_tokens": 6207564.0,
|
||
|
|
"step": 3365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.000332260131836,
|
||
|
|
"epoch": 0.28313379542113004,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004995527737941518,
|
||
|
|
"loss": 5.8581,
|
||
|
|
"mean_token_accuracy": 0.14725540429353715,
|
||
|
|
"num_tokens": 6216056.0,
|
||
|
|
"step": 3370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.969868230819702,
|
||
|
|
"epoch": 0.2835538752362949,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004995508846039287,
|
||
|
|
"loss": 5.8259,
|
||
|
|
"mean_token_accuracy": 0.1441423200070858,
|
||
|
|
"num_tokens": 6225573.0,
|
||
|
|
"step": 3375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.054820203781128,
|
||
|
|
"epoch": 0.2839739550514598,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004995489914359023,
|
||
|
|
"loss": 5.9519,
|
||
|
|
"mean_token_accuracy": 0.13889921978116035,
|
||
|
|
"num_tokens": 6235057.0,
|
||
|
|
"step": 3380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0446230411529545,
|
||
|
|
"epoch": 0.2843940348666247,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004995470942901061,
|
||
|
|
"loss": 5.8635,
|
||
|
|
"mean_token_accuracy": 0.1436339296400547,
|
||
|
|
"num_tokens": 6244164.0,
|
||
|
|
"step": 3385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.036704730987549,
|
||
|
|
"epoch": 0.2848141146817895,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004995451931665738,
|
||
|
|
"loss": 5.8685,
|
||
|
|
"mean_token_accuracy": 0.14183638542890548,
|
||
|
|
"num_tokens": 6253095.0,
|
||
|
|
"step": 3390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9995965480804445,
|
||
|
|
"epoch": 0.2852341944969544,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.000499543288065339,
|
||
|
|
"loss": 5.817,
|
||
|
|
"mean_token_accuracy": 0.14616027921438218,
|
||
|
|
"num_tokens": 6261134.0,
|
||
|
|
"step": 3395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918176984786987,
|
||
|
|
"epoch": 0.2856542743121193,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004995413789864354,
|
||
|
|
"loss": 5.8093,
|
||
|
|
"mean_token_accuracy": 0.15111583173274995,
|
||
|
|
"num_tokens": 6270384.0,
|
||
|
|
"step": 3400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.925231647491455,
|
||
|
|
"epoch": 0.28607435412728416,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004995394659298971,
|
||
|
|
"loss": 5.7581,
|
||
|
|
"mean_token_accuracy": 0.15247000753879547,
|
||
|
|
"num_tokens": 6279702.0,
|
||
|
|
"step": 3405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9355387687683105,
|
||
|
|
"epoch": 0.28649443394244906,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004995375488957576,
|
||
|
|
"loss": 5.8087,
|
||
|
|
"mean_token_accuracy": 0.14355491399765014,
|
||
|
|
"num_tokens": 6288297.0,
|
||
|
|
"step": 3410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.953091335296631,
|
||
|
|
"epoch": 0.28691451375761395,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.000499535627884051,
|
||
|
|
"loss": 5.8943,
|
||
|
|
"mean_token_accuracy": 0.13816075548529624,
|
||
|
|
"num_tokens": 6297288.0,
|
||
|
|
"step": 3415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.1151526927947994,
|
||
|
|
"epoch": 0.28733459357277885,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004995337028948115,
|
||
|
|
"loss": 5.912,
|
||
|
|
"mean_token_accuracy": 0.13960782587528228,
|
||
|
|
"num_tokens": 6306719.0,
|
||
|
|
"step": 3420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.956048154830933,
|
||
|
|
"epoch": 0.2877546733879437,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004995317739280731,
|
||
|
|
"loss": 5.7384,
|
||
|
|
"mean_token_accuracy": 0.15413220077753068,
|
||
|
|
"num_tokens": 6316639.0,
|
||
|
|
"step": 3425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9882111072540285,
|
||
|
|
"epoch": 0.2881747532031086,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004995298409838699,
|
||
|
|
"loss": 5.8729,
|
||
|
|
"mean_token_accuracy": 0.14296835884451867,
|
||
|
|
"num_tokens": 6326879.0,
|
||
|
|
"step": 3430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.922442245483398,
|
||
|
|
"epoch": 0.2885948330182735,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.000499527904062236,
|
||
|
|
"loss": 5.7735,
|
||
|
|
"mean_token_accuracy": 0.15226557850837708,
|
||
|
|
"num_tokens": 6335729.0,
|
||
|
|
"step": 3435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.973740720748902,
|
||
|
|
"epoch": 0.28901491283343833,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004995259631632061,
|
||
|
|
"loss": 5.8537,
|
||
|
|
"mean_token_accuracy": 0.1386033460497856,
|
||
|
|
"num_tokens": 6345154.0,
|
||
|
|
"step": 3440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9747546195983885,
|
||
|
|
"epoch": 0.28943499264860323,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004995240182868143,
|
||
|
|
"loss": 5.8072,
|
||
|
|
"mean_token_accuracy": 0.14772575795650483,
|
||
|
|
"num_tokens": 6354309.0,
|
||
|
|
"step": 3445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.879770755767822,
|
||
|
|
"epoch": 0.2898550724637681,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004995220694330951,
|
||
|
|
"loss": 5.764,
|
||
|
|
"mean_token_accuracy": 0.14814788177609445,
|
||
|
|
"num_tokens": 6363389.0,
|
||
|
|
"step": 3450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.928126335144043,
|
||
|
|
"epoch": 0.290275152278933,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004995201166020832,
|
||
|
|
"loss": 5.8394,
|
||
|
|
"mean_token_accuracy": 0.1423036128282547,
|
||
|
|
"num_tokens": 6372475.0,
|
||
|
|
"step": 3455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.01046404838562,
|
||
|
|
"epoch": 0.29069523209409787,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.000499518159793813,
|
||
|
|
"loss": 5.7909,
|
||
|
|
"mean_token_accuracy": 0.15391181409358978,
|
||
|
|
"num_tokens": 6380906.0,
|
||
|
|
"step": 3460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.901024436950683,
|
||
|
|
"epoch": 0.29111531190926276,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.000499516199008319,
|
||
|
|
"loss": 5.7893,
|
||
|
|
"mean_token_accuracy": 0.147665573656559,
|
||
|
|
"num_tokens": 6390085.0,
|
||
|
|
"step": 3465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.005919504165649,
|
||
|
|
"epoch": 0.29153539172442766,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004995142342456364,
|
||
|
|
"loss": 5.8587,
|
||
|
|
"mean_token_accuracy": 0.14177713990211488,
|
||
|
|
"num_tokens": 6399441.0,
|
||
|
|
"step": 3470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.037836742401123,
|
||
|
|
"epoch": 0.2919554715395925,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004995122655057997,
|
||
|
|
"loss": 5.9277,
|
||
|
|
"mean_token_accuracy": 0.14434729218482972,
|
||
|
|
"num_tokens": 6408995.0,
|
||
|
|
"step": 3475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8759626865386965,
|
||
|
|
"epoch": 0.2923755513547574,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004995102927888437,
|
||
|
|
"loss": 5.6769,
|
||
|
|
"mean_token_accuracy": 0.15346557945013045,
|
||
|
|
"num_tokens": 6418080.0,
|
||
|
|
"step": 3480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.980447435379029,
|
||
|
|
"epoch": 0.2927956311699223,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004995083160948036,
|
||
|
|
"loss": 5.8654,
|
||
|
|
"mean_token_accuracy": 0.14365637302398682,
|
||
|
|
"num_tokens": 6426732.0,
|
||
|
|
"step": 3485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918527126312256,
|
||
|
|
"epoch": 0.29321571098508714,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004995063354237141,
|
||
|
|
"loss": 5.8601,
|
||
|
|
"mean_token_accuracy": 0.14886348843574523,
|
||
|
|
"num_tokens": 6435957.0,
|
||
|
|
"step": 3490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.965629720687867,
|
||
|
|
"epoch": 0.29363579080025204,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004995043507756107,
|
||
|
|
"loss": 5.807,
|
||
|
|
"mean_token_accuracy": 0.14377646446228026,
|
||
|
|
"num_tokens": 6445642.0,
|
||
|
|
"step": 3495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.966208457946777,
|
||
|
|
"epoch": 0.29405587061541694,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004995023621505282,
|
||
|
|
"loss": 5.8468,
|
||
|
|
"mean_token_accuracy": 0.14531085640192032,
|
||
|
|
"num_tokens": 6454664.0,
|
||
|
|
"step": 3500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.846572160720825,
|
||
|
|
"epoch": 0.29447595043058183,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.000499500369548502,
|
||
|
|
"loss": 5.7718,
|
||
|
|
"mean_token_accuracy": 0.14744968637824057,
|
||
|
|
"num_tokens": 6463224.0,
|
||
|
|
"step": 3505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.10300350189209,
|
||
|
|
"epoch": 0.2948960302457467,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004994983729695674,
|
||
|
|
"loss": 5.9886,
|
||
|
|
"mean_token_accuracy": 0.13981593102216722,
|
||
|
|
"num_tokens": 6473112.0,
|
||
|
|
"step": 3510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.991326189041137,
|
||
|
|
"epoch": 0.2953161100609116,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004994963724137595,
|
||
|
|
"loss": 5.834,
|
||
|
|
"mean_token_accuracy": 0.14485643282532693,
|
||
|
|
"num_tokens": 6482062.0,
|
||
|
|
"step": 3515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.928696584701538,
|
||
|
|
"epoch": 0.29573618987607647,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004994943678811142,
|
||
|
|
"loss": 5.8362,
|
||
|
|
"mean_token_accuracy": 0.1416163809597492,
|
||
|
|
"num_tokens": 6490568.0,
|
||
|
|
"step": 3520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.993920183181762,
|
||
|
|
"epoch": 0.2961562696912413,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004994923593716667,
|
||
|
|
"loss": 5.8772,
|
||
|
|
"mean_token_accuracy": 0.14611808955669403,
|
||
|
|
"num_tokens": 6500815.0,
|
||
|
|
"step": 3525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.930905771255493,
|
||
|
|
"epoch": 0.2965763495064062,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004994903468854527,
|
||
|
|
"loss": 5.7544,
|
||
|
|
"mean_token_accuracy": 0.15672436058521272,
|
||
|
|
"num_tokens": 6509529.0,
|
||
|
|
"step": 3530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8914727687835695,
|
||
|
|
"epoch": 0.2969964293215711,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004994883304225077,
|
||
|
|
"loss": 5.8141,
|
||
|
|
"mean_token_accuracy": 0.1436660371720791,
|
||
|
|
"num_tokens": 6517934.0,
|
||
|
|
"step": 3535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.048480892181397,
|
||
|
|
"epoch": 0.297416509136736,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004994863099828675,
|
||
|
|
"loss": 5.7902,
|
||
|
|
"mean_token_accuracy": 0.14704177230596543,
|
||
|
|
"num_tokens": 6526098.0,
|
||
|
|
"step": 3540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.920773935317993,
|
||
|
|
"epoch": 0.29783658895190085,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.000499484285566568,
|
||
|
|
"loss": 5.8221,
|
||
|
|
"mean_token_accuracy": 0.14378595799207688,
|
||
|
|
"num_tokens": 6535831.0,
|
||
|
|
"step": 3545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.922514152526856,
|
||
|
|
"epoch": 0.29825666876706575,
|
||
|
|
"grad_norm": 0.859375,
|
||
|
|
"learning_rate": 0.0004994822571736449,
|
||
|
|
"loss": 5.7254,
|
||
|
|
"mean_token_accuracy": 0.1482064038515091,
|
||
|
|
"num_tokens": 6545704.0,
|
||
|
|
"step": 3550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.899800491333008,
|
||
|
|
"epoch": 0.29867674858223064,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004994802248041342,
|
||
|
|
"loss": 5.7535,
|
||
|
|
"mean_token_accuracy": 0.14916675686836242,
|
||
|
|
"num_tokens": 6554423.0,
|
||
|
|
"step": 3555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.932198619842529,
|
||
|
|
"epoch": 0.2990968283973955,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.000499478188458072,
|
||
|
|
"loss": 5.8022,
|
||
|
|
"mean_token_accuracy": 0.14890404120087625,
|
||
|
|
"num_tokens": 6563989.0,
|
||
|
|
"step": 3560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.968116617202758,
|
||
|
|
"epoch": 0.2995169082125604,
|
||
|
|
"grad_norm": 1.1171875,
|
||
|
|
"learning_rate": 0.0004994761481354943,
|
||
|
|
"loss": 5.9483,
|
||
|
|
"mean_token_accuracy": 0.1441567473113537,
|
||
|
|
"num_tokens": 6572745.0,
|
||
|
|
"step": 3565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.137206792831421,
|
||
|
|
"epoch": 0.2999369880277253,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004994741038364371,
|
||
|
|
"loss": 5.9343,
|
||
|
|
"mean_token_accuracy": 0.142555071413517,
|
||
|
|
"num_tokens": 6581723.0,
|
||
|
|
"step": 3570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.88220705986023,
|
||
|
|
"epoch": 0.3003570678428901,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004994720555609369,
|
||
|
|
"loss": 5.6659,
|
||
|
|
"mean_token_accuracy": 0.1542235180735588,
|
||
|
|
"num_tokens": 6590342.0,
|
||
|
|
"step": 3575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.829970359802246,
|
||
|
|
"epoch": 0.300777147658055,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004994700033090297,
|
||
|
|
"loss": 5.7501,
|
||
|
|
"mean_token_accuracy": 0.1582304283976555,
|
||
|
|
"num_tokens": 6599206.0,
|
||
|
|
"step": 3580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.041889762878418,
|
||
|
|
"epoch": 0.3011972274732199,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.000499467947080752,
|
||
|
|
"loss": 6.0318,
|
||
|
|
"mean_token_accuracy": 0.13561916202306748,
|
||
|
|
"num_tokens": 6608947.0,
|
||
|
|
"step": 3585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.06544942855835,
|
||
|
|
"epoch": 0.3016173072883848,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004994658868761402,
|
||
|
|
"loss": 5.8283,
|
||
|
|
"mean_token_accuracy": 0.15170362889766692,
|
||
|
|
"num_tokens": 6618378.0,
|
||
|
|
"step": 3590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.914470624923706,
|
||
|
|
"epoch": 0.30203738710354966,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004994638226952307,
|
||
|
|
"loss": 5.8836,
|
||
|
|
"mean_token_accuracy": 0.14195557832717895,
|
||
|
|
"num_tokens": 6627527.0,
|
||
|
|
"step": 3595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.982400751113891,
|
||
|
|
"epoch": 0.30245746691871456,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004994617545380604,
|
||
|
|
"loss": 5.8286,
|
||
|
|
"mean_token_accuracy": 0.14527858346700667,
|
||
|
|
"num_tokens": 6636964.0,
|
||
|
|
"step": 3600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.908453559875488,
|
||
|
|
"epoch": 0.30287754673387945,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004994596824046656,
|
||
|
|
"loss": 5.7718,
|
||
|
|
"mean_token_accuracy": 0.14911266565322875,
|
||
|
|
"num_tokens": 6646074.0,
|
||
|
|
"step": 3605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.99076018333435,
|
||
|
|
"epoch": 0.3032976265490443,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.000499457606295083,
|
||
|
|
"loss": 5.8447,
|
||
|
|
"mean_token_accuracy": 0.14240661412477493,
|
||
|
|
"num_tokens": 6655027.0,
|
||
|
|
"step": 3610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.808787536621094,
|
||
|
|
"epoch": 0.3037177063642092,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004994555262093495,
|
||
|
|
"loss": 5.6321,
|
||
|
|
"mean_token_accuracy": 0.1570141136646271,
|
||
|
|
"num_tokens": 6663747.0,
|
||
|
|
"step": 3615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.046371412277222,
|
||
|
|
"epoch": 0.3041377861793741,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000499453442147502,
|
||
|
|
"loss": 5.9593,
|
||
|
|
"mean_token_accuracy": 0.1389522023499012,
|
||
|
|
"num_tokens": 6672922.0,
|
||
|
|
"step": 3620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9334362030029295,
|
||
|
|
"epoch": 0.304557865994539,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004994513541095773,
|
||
|
|
"loss": 5.7735,
|
||
|
|
"mean_token_accuracy": 0.15685406178236008,
|
||
|
|
"num_tokens": 6682233.0,
|
||
|
|
"step": 3625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.922385549545288,
|
||
|
|
"epoch": 0.30497794580970383,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004994492620956126,
|
||
|
|
"loss": 5.8112,
|
||
|
|
"mean_token_accuracy": 0.15047757476568221,
|
||
|
|
"num_tokens": 6691593.0,
|
||
|
|
"step": 3630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.917299842834472,
|
||
|
|
"epoch": 0.30539802562486873,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004994471661056445,
|
||
|
|
"loss": 5.8207,
|
||
|
|
"mean_token_accuracy": 0.15176298022270202,
|
||
|
|
"num_tokens": 6701318.0,
|
||
|
|
"step": 3635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.031417036056519,
|
||
|
|
"epoch": 0.3058181054400336,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004994450661397106,
|
||
|
|
"loss": 5.8199,
|
||
|
|
"mean_token_accuracy": 0.1515482097864151,
|
||
|
|
"num_tokens": 6710059.0,
|
||
|
|
"step": 3640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.035120582580566,
|
||
|
|
"epoch": 0.30623818525519847,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.000499442962197848,
|
||
|
|
"loss": 5.9111,
|
||
|
|
"mean_token_accuracy": 0.14002010971307755,
|
||
|
|
"num_tokens": 6719811.0,
|
||
|
|
"step": 3645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.872648668289185,
|
||
|
|
"epoch": 0.30665826507036337,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004994408542800937,
|
||
|
|
"loss": 5.7991,
|
||
|
|
"mean_token_accuracy": 0.15095670521259308,
|
||
|
|
"num_tokens": 6728789.0,
|
||
|
|
"step": 3650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.943379068374634,
|
||
|
|
"epoch": 0.30707834488552826,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004994387423864855,
|
||
|
|
"loss": 5.7834,
|
||
|
|
"mean_token_accuracy": 0.1460746333003044,
|
||
|
|
"num_tokens": 6737706.0,
|
||
|
|
"step": 3655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.941844272613525,
|
||
|
|
"epoch": 0.3074984247006931,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004994366265170603,
|
||
|
|
"loss": 5.7446,
|
||
|
|
"mean_token_accuracy": 0.16055794954299926,
|
||
|
|
"num_tokens": 6746861.0,
|
||
|
|
"step": 3660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.028618669509887,
|
||
|
|
"epoch": 0.307918504515858,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004994345066718558,
|
||
|
|
"loss": 5.916,
|
||
|
|
"mean_token_accuracy": 0.14116688221693038,
|
||
|
|
"num_tokens": 6755242.0,
|
||
|
|
"step": 3665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.008127069473266,
|
||
|
|
"epoch": 0.3083385843310229,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004994323828509098,
|
||
|
|
"loss": 5.8727,
|
||
|
|
"mean_token_accuracy": 0.14286566898226738,
|
||
|
|
"num_tokens": 6764549.0,
|
||
|
|
"step": 3670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.929146242141724,
|
||
|
|
"epoch": 0.3087586641461878,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004994302550542596,
|
||
|
|
"loss": 5.8471,
|
||
|
|
"mean_token_accuracy": 0.1538454920053482,
|
||
|
|
"num_tokens": 6774123.0,
|
||
|
|
"step": 3675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.80585126876831,
|
||
|
|
"epoch": 0.30917874396135264,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000499428123281943,
|
||
|
|
"loss": 5.6317,
|
||
|
|
"mean_token_accuracy": 0.1558361306786537,
|
||
|
|
"num_tokens": 6782922.0,
|
||
|
|
"step": 3680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.925417232513428,
|
||
|
|
"epoch": 0.30959882377651754,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004994259875339978,
|
||
|
|
"loss": 5.8838,
|
||
|
|
"mean_token_accuracy": 0.14831040799617767,
|
||
|
|
"num_tokens": 6792042.0,
|
||
|
|
"step": 3685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.067014694213867,
|
||
|
|
"epoch": 0.31001890359168244,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004994238478104617,
|
||
|
|
"loss": 5.872,
|
||
|
|
"mean_token_accuracy": 0.14466599076986314,
|
||
|
|
"num_tokens": 6800994.0,
|
||
|
|
"step": 3690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.913062810897827,
|
||
|
|
"epoch": 0.3104389834068473,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004994217041113727,
|
||
|
|
"loss": 5.8012,
|
||
|
|
"mean_token_accuracy": 0.15395486801862718,
|
||
|
|
"num_tokens": 6809938.0,
|
||
|
|
"step": 3695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.028704833984375,
|
||
|
|
"epoch": 0.3108590632220122,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004994195564367688,
|
||
|
|
"loss": 5.9148,
|
||
|
|
"mean_token_accuracy": 0.14361433312296867,
|
||
|
|
"num_tokens": 6820289.0,
|
||
|
|
"step": 3700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.998479652404785,
|
||
|
|
"epoch": 0.3112791430371771,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004994174047866882,
|
||
|
|
"loss": 5.7538,
|
||
|
|
"mean_token_accuracy": 0.15162525251507758,
|
||
|
|
"num_tokens": 6830068.0,
|
||
|
|
"step": 3705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830403566360474,
|
||
|
|
"epoch": 0.3116992228523419,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004994152491611686,
|
||
|
|
"loss": 5.7916,
|
||
|
|
"mean_token_accuracy": 0.14659319072961807,
|
||
|
|
"num_tokens": 6838591.0,
|
||
|
|
"step": 3710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.838834381103515,
|
||
|
|
"epoch": 0.3121193026675068,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004994130895602485,
|
||
|
|
"loss": 5.7583,
|
||
|
|
"mean_token_accuracy": 0.14570422172546388,
|
||
|
|
"num_tokens": 6847796.0,
|
||
|
|
"step": 3715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.02327971458435,
|
||
|
|
"epoch": 0.3125393824826717,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.000499410925983966,
|
||
|
|
"loss": 5.8457,
|
||
|
|
"mean_token_accuracy": 0.14952262938022615,
|
||
|
|
"num_tokens": 6856585.0,
|
||
|
|
"step": 3720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.887494659423828,
|
||
|
|
"epoch": 0.3129594622978366,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004994087584323596,
|
||
|
|
"loss": 5.7583,
|
||
|
|
"mean_token_accuracy": 0.15517981797456742,
|
||
|
|
"num_tokens": 6865757.0,
|
||
|
|
"step": 3725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.853988265991211,
|
||
|
|
"epoch": 0.31337954211300145,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004994065869054676,
|
||
|
|
"loss": 5.796,
|
||
|
|
"mean_token_accuracy": 0.1451224982738495,
|
||
|
|
"num_tokens": 6875371.0,
|
||
|
|
"step": 3730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.02379846572876,
|
||
|
|
"epoch": 0.31379962192816635,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004994044114033283,
|
||
|
|
"loss": 5.8687,
|
||
|
|
"mean_token_accuracy": 0.1440061092376709,
|
||
|
|
"num_tokens": 6884050.0,
|
||
|
|
"step": 3735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.026759815216065,
|
||
|
|
"epoch": 0.31421970174333125,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004994022319259806,
|
||
|
|
"loss": 5.8372,
|
||
|
|
"mean_token_accuracy": 0.14598554819822313,
|
||
|
|
"num_tokens": 6893079.0,
|
||
|
|
"step": 3740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.911620283126831,
|
||
|
|
"epoch": 0.3146397815584961,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004994000484734629,
|
||
|
|
"loss": 5.9136,
|
||
|
|
"mean_token_accuracy": 0.15156169682741166,
|
||
|
|
"num_tokens": 6903100.0,
|
||
|
|
"step": 3745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.923766088485718,
|
||
|
|
"epoch": 0.315059861373661,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004993978610458137,
|
||
|
|
"loss": 5.7654,
|
||
|
|
"mean_token_accuracy": 0.15068738907575607,
|
||
|
|
"num_tokens": 6912164.0,
|
||
|
|
"step": 3750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.878131437301636,
|
||
|
|
"epoch": 0.3154799411888259,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004993956696430721,
|
||
|
|
"loss": 5.7781,
|
||
|
|
"mean_token_accuracy": 0.1453731819987297,
|
||
|
|
"num_tokens": 6921183.0,
|
||
|
|
"step": 3755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.950732278823852,
|
||
|
|
"epoch": 0.3159000210039908,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004993934742652768,
|
||
|
|
"loss": 5.8422,
|
||
|
|
"mean_token_accuracy": 0.14924204498529434,
|
||
|
|
"num_tokens": 6931325.0,
|
||
|
|
"step": 3760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.98630108833313,
|
||
|
|
"epoch": 0.3163201008191556,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004993912749124665,
|
||
|
|
"loss": 5.7579,
|
||
|
|
"mean_token_accuracy": 0.15365685075521468,
|
||
|
|
"num_tokens": 6940234.0,
|
||
|
|
"step": 3765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.933948040008545,
|
||
|
|
"epoch": 0.3167401806343205,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004993890715846804,
|
||
|
|
"loss": 5.8442,
|
||
|
|
"mean_token_accuracy": 0.1472316324710846,
|
||
|
|
"num_tokens": 6949067.0,
|
||
|
|
"step": 3770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.98266453742981,
|
||
|
|
"epoch": 0.3171602604494854,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004993868642819574,
|
||
|
|
"loss": 5.8092,
|
||
|
|
"mean_token_accuracy": 0.14614944905042648,
|
||
|
|
"num_tokens": 6959085.0,
|
||
|
|
"step": 3775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.905980443954467,
|
||
|
|
"epoch": 0.31758034026465026,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004993846530043367,
|
||
|
|
"loss": 5.8539,
|
||
|
|
"mean_token_accuracy": 0.14434425979852678,
|
||
|
|
"num_tokens": 6967392.0,
|
||
|
|
"step": 3780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.910531997680664,
|
||
|
|
"epoch": 0.31800042007981516,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004993824377518574,
|
||
|
|
"loss": 5.7851,
|
||
|
|
"mean_token_accuracy": 0.1514693483710289,
|
||
|
|
"num_tokens": 6976369.0,
|
||
|
|
"step": 3785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.976119804382324,
|
||
|
|
"epoch": 0.31842049989498006,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004993802185245587,
|
||
|
|
"loss": 5.8013,
|
||
|
|
"mean_token_accuracy": 0.14934585690498353,
|
||
|
|
"num_tokens": 6985889.0,
|
||
|
|
"step": 3790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.925661182403564,
|
||
|
|
"epoch": 0.3188405797101449,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.00049937799532248,
|
||
|
|
"loss": 5.8359,
|
||
|
|
"mean_token_accuracy": 0.13918048441410064,
|
||
|
|
"num_tokens": 6995396.0,
|
||
|
|
"step": 3795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.0729657173156735,
|
||
|
|
"epoch": 0.3192606595253098,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004993757681456607,
|
||
|
|
"loss": 5.8718,
|
||
|
|
"mean_token_accuracy": 0.1478106528520584,
|
||
|
|
"num_tokens": 7004666.0,
|
||
|
|
"step": 3800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.967416000366211,
|
||
|
|
"epoch": 0.3196807393404747,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004993735369941401,
|
||
|
|
"loss": 5.8998,
|
||
|
|
"mean_token_accuracy": 0.14525311812758446,
|
||
|
|
"num_tokens": 7014608.0,
|
||
|
|
"step": 3805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.966092729568482,
|
||
|
|
"epoch": 0.3201008191556396,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004993713018679579,
|
||
|
|
"loss": 5.7888,
|
||
|
|
"mean_token_accuracy": 0.14646613076329232,
|
||
|
|
"num_tokens": 7023671.0,
|
||
|
|
"step": 3810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.904713773727417,
|
||
|
|
"epoch": 0.32052089897080444,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004993690627671536,
|
||
|
|
"loss": 5.8148,
|
||
|
|
"mean_token_accuracy": 0.1434755489230156,
|
||
|
|
"num_tokens": 7033786.0,
|
||
|
|
"step": 3815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.907800912857056,
|
||
|
|
"epoch": 0.32094097878596933,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004993668196917669,
|
||
|
|
"loss": 5.7268,
|
||
|
|
"mean_token_accuracy": 0.15316082686185836,
|
||
|
|
"num_tokens": 7042162.0,
|
||
|
|
"step": 3820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.994227170944214,
|
||
|
|
"epoch": 0.32136105860113423,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004993645726418375,
|
||
|
|
"loss": 5.8618,
|
||
|
|
"mean_token_accuracy": 0.15052291825413705,
|
||
|
|
"num_tokens": 7051903.0,
|
||
|
|
"step": 3825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.900808525085449,
|
||
|
|
"epoch": 0.3217811384162991,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004993623216174053,
|
||
|
|
"loss": 5.7121,
|
||
|
|
"mean_token_accuracy": 0.161135034263134,
|
||
|
|
"num_tokens": 7060229.0,
|
||
|
|
"step": 3830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.845855093002319,
|
||
|
|
"epoch": 0.32220121823146397,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.00049936006661851,
|
||
|
|
"loss": 5.7989,
|
||
|
|
"mean_token_accuracy": 0.1526742696762085,
|
||
|
|
"num_tokens": 7069040.0,
|
||
|
|
"step": 3835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.919027471542359,
|
||
|
|
"epoch": 0.32262129804662887,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004993578076451917,
|
||
|
|
"loss": 5.6805,
|
||
|
|
"mean_token_accuracy": 0.15347311198711394,
|
||
|
|
"num_tokens": 7078409.0,
|
||
|
|
"step": 3840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.853667831420898,
|
||
|
|
"epoch": 0.32304137786179377,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004993555446974903,
|
||
|
|
"loss": 5.765,
|
||
|
|
"mean_token_accuracy": 0.14782839864492417,
|
||
|
|
"num_tokens": 7087983.0,
|
||
|
|
"step": 3845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.853893089294433,
|
||
|
|
"epoch": 0.3234614576769586,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.000499353277775446,
|
||
|
|
"loss": 5.7182,
|
||
|
|
"mean_token_accuracy": 0.1580560803413391,
|
||
|
|
"num_tokens": 7097277.0,
|
||
|
|
"step": 3850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.87832407951355,
|
||
|
|
"epoch": 0.3238815374921235,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004993510068790989,
|
||
|
|
"loss": 5.6187,
|
||
|
|
"mean_token_accuracy": 0.16494725197553634,
|
||
|
|
"num_tokens": 7105918.0,
|
||
|
|
"step": 3855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8204621315002445,
|
||
|
|
"epoch": 0.3243016173072884,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004993487320084892,
|
||
|
|
"loss": 5.6885,
|
||
|
|
"mean_token_accuracy": 0.1581684559583664,
|
||
|
|
"num_tokens": 7115049.0,
|
||
|
|
"step": 3860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.950232267379761,
|
||
|
|
"epoch": 0.32472169712245325,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004993464531636573,
|
||
|
|
"loss": 5.7875,
|
||
|
|
"mean_token_accuracy": 0.1498127706348896,
|
||
|
|
"num_tokens": 7124862.0,
|
||
|
|
"step": 3865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.82954216003418,
|
||
|
|
"epoch": 0.32514177693761814,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004993441703446435,
|
||
|
|
"loss": 5.6777,
|
||
|
|
"mean_token_accuracy": 0.1620057240128517,
|
||
|
|
"num_tokens": 7133280.0,
|
||
|
|
"step": 3870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.929150485992432,
|
||
|
|
"epoch": 0.32556185675278304,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004993418835514882,
|
||
|
|
"loss": 5.8773,
|
||
|
|
"mean_token_accuracy": 0.14564588218927382,
|
||
|
|
"num_tokens": 7142446.0,
|
||
|
|
"step": 3875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9440654754638675,
|
||
|
|
"epoch": 0.3259819365679479,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004993395927842321,
|
||
|
|
"loss": 5.7755,
|
||
|
|
"mean_token_accuracy": 0.14392856359481812,
|
||
|
|
"num_tokens": 7152143.0,
|
||
|
|
"step": 3880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.021526956558228,
|
||
|
|
"epoch": 0.3264020163831128,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004993372980429155,
|
||
|
|
"loss": 5.8501,
|
||
|
|
"mean_token_accuracy": 0.14762358814477922,
|
||
|
|
"num_tokens": 7162046.0,
|
||
|
|
"step": 3885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.937510824203491,
|
||
|
|
"epoch": 0.3268220961982777,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004993349993275792,
|
||
|
|
"loss": 5.7358,
|
||
|
|
"mean_token_accuracy": 0.1501179426908493,
|
||
|
|
"num_tokens": 7171557.0,
|
||
|
|
"step": 3890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.722299528121948,
|
||
|
|
"epoch": 0.3272421760134426,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.0004993326966382639,
|
||
|
|
"loss": 5.6455,
|
||
|
|
"mean_token_accuracy": 0.15715345591306687,
|
||
|
|
"num_tokens": 7180927.0,
|
||
|
|
"step": 3895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.841052865982055,
|
||
|
|
"epoch": 0.3276622558286074,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004993303899750104,
|
||
|
|
"loss": 5.728,
|
||
|
|
"mean_token_accuracy": 0.15390928834676743,
|
||
|
|
"num_tokens": 7189552.0,
|
||
|
|
"step": 3900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.984076976776123,
|
||
|
|
"epoch": 0.3280823356437723,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004993280793378595,
|
||
|
|
"loss": 5.7447,
|
||
|
|
"mean_token_accuracy": 0.14799359515309335,
|
||
|
|
"num_tokens": 7197857.0,
|
||
|
|
"step": 3905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.883258295059204,
|
||
|
|
"epoch": 0.3285024154589372,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004993257647268522,
|
||
|
|
"loss": 5.7153,
|
||
|
|
"mean_token_accuracy": 0.15892730355262757,
|
||
|
|
"num_tokens": 7206785.0,
|
||
|
|
"step": 3910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8749652862548825,
|
||
|
|
"epoch": 0.32892249527410206,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004993234461420295,
|
||
|
|
"loss": 5.8032,
|
||
|
|
"mean_token_accuracy": 0.1540107510983944,
|
||
|
|
"num_tokens": 7216360.0,
|
||
|
|
"step": 3915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.903149938583374,
|
||
|
|
"epoch": 0.32934257508926695,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004993211235834326,
|
||
|
|
"loss": 5.6111,
|
||
|
|
"mean_token_accuracy": 0.1713676080107689,
|
||
|
|
"num_tokens": 7224890.0,
|
||
|
|
"step": 3920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.803111982345581,
|
||
|
|
"epoch": 0.32976265490443185,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004993187970511023,
|
||
|
|
"loss": 5.6647,
|
||
|
|
"mean_token_accuracy": 0.17485086023807525,
|
||
|
|
"num_tokens": 7234442.0,
|
||
|
|
"step": 3925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.873620986938477,
|
||
|
|
"epoch": 0.33018273471959675,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004993164665450801,
|
||
|
|
"loss": 5.8228,
|
||
|
|
"mean_token_accuracy": 0.15156899392604828,
|
||
|
|
"num_tokens": 7244023.0,
|
||
|
|
"step": 3930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.843383169174194,
|
||
|
|
"epoch": 0.3306028145347616,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004993141320654072,
|
||
|
|
"loss": 5.6665,
|
||
|
|
"mean_token_accuracy": 0.15884078443050384,
|
||
|
|
"num_tokens": 7253548.0,
|
||
|
|
"step": 3935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8344789981842045,
|
||
|
|
"epoch": 0.3310228943499265,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.000499311793612125,
|
||
|
|
"loss": 5.7347,
|
||
|
|
"mean_token_accuracy": 0.15194563269615174,
|
||
|
|
"num_tokens": 7262962.0,
|
||
|
|
"step": 3940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9449968338012695,
|
||
|
|
"epoch": 0.3314429741650914,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004993094511852748,
|
||
|
|
"loss": 5.7609,
|
||
|
|
"mean_token_accuracy": 0.14924739301204681,
|
||
|
|
"num_tokens": 7272234.0,
|
||
|
|
"step": 3945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.968133401870728,
|
||
|
|
"epoch": 0.33186305398025623,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004993071047848983,
|
||
|
|
"loss": 5.7413,
|
||
|
|
"mean_token_accuracy": 0.15319221317768097,
|
||
|
|
"num_tokens": 7281524.0,
|
||
|
|
"step": 3950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.790039682388306,
|
||
|
|
"epoch": 0.3322831337954211,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004993047544110368,
|
||
|
|
"loss": 5.6528,
|
||
|
|
"mean_token_accuracy": 0.15719158425927163,
|
||
|
|
"num_tokens": 7289601.0,
|
||
|
|
"step": 3955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.721573781967163,
|
||
|
|
"epoch": 0.332703213610586,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004993024000637321,
|
||
|
|
"loss": 5.6074,
|
||
|
|
"mean_token_accuracy": 0.16373219192028046,
|
||
|
|
"num_tokens": 7298508.0,
|
||
|
|
"step": 3960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.854639863967895,
|
||
|
|
"epoch": 0.33312329342575087,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004993000417430259,
|
||
|
|
"loss": 5.8333,
|
||
|
|
"mean_token_accuracy": 0.14586606696248056,
|
||
|
|
"num_tokens": 7309065.0,
|
||
|
|
"step": 3965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.050255537033081,
|
||
|
|
"epoch": 0.33354337324091576,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.00049929767944896,
|
||
|
|
"loss": 5.8607,
|
||
|
|
"mean_token_accuracy": 0.14968539252877236,
|
||
|
|
"num_tokens": 7319669.0,
|
||
|
|
"step": 3970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.973075866699219,
|
||
|
|
"epoch": 0.33396345305608066,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004992953131815761,
|
||
|
|
"loss": 5.7964,
|
||
|
|
"mean_token_accuracy": 0.14924187809228898,
|
||
|
|
"num_tokens": 7328425.0,
|
||
|
|
"step": 3975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.858473682403565,
|
||
|
|
"epoch": 0.33438353287124556,
|
||
|
|
"grad_norm": 1.0703125,
|
||
|
|
"learning_rate": 0.0004992929429409164,
|
||
|
|
"loss": 5.6701,
|
||
|
|
"mean_token_accuracy": 0.15970652550458908,
|
||
|
|
"num_tokens": 7337369.0,
|
||
|
|
"step": 3980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.832104206085205,
|
||
|
|
"epoch": 0.3348036126864104,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004992905687270225,
|
||
|
|
"loss": 5.7375,
|
||
|
|
"mean_token_accuracy": 0.15307654216885566,
|
||
|
|
"num_tokens": 7346829.0,
|
||
|
|
"step": 3985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9267027378082275,
|
||
|
|
"epoch": 0.3352236925015753,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004992881905399368,
|
||
|
|
"loss": 5.7952,
|
||
|
|
"mean_token_accuracy": 0.14916737228631974,
|
||
|
|
"num_tokens": 7355976.0,
|
||
|
|
"step": 3990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.941111850738525,
|
||
|
|
"epoch": 0.3356437723167402,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004992858083797013,
|
||
|
|
"loss": 5.7675,
|
||
|
|
"mean_token_accuracy": 0.1473349630832672,
|
||
|
|
"num_tokens": 7365210.0,
|
||
|
|
"step": 3995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9041369438171385,
|
||
|
|
"epoch": 0.33606385213190504,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004992834222463581,
|
||
|
|
"loss": 5.8093,
|
||
|
|
"mean_token_accuracy": 0.14046019837260246,
|
||
|
|
"num_tokens": 7374175.0,
|
||
|
|
"step": 4000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.923312139511109,
|
||
|
|
"epoch": 0.33648393194706994,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004992810321399496,
|
||
|
|
"loss": 5.8383,
|
||
|
|
"mean_token_accuracy": 0.147621788084507,
|
||
|
|
"num_tokens": 7383302.0,
|
||
|
|
"step": 4005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.99611988067627,
|
||
|
|
"epoch": 0.33690401176223483,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004992786380605182,
|
||
|
|
"loss": 5.8018,
|
||
|
|
"mean_token_accuracy": 0.15006497725844384,
|
||
|
|
"num_tokens": 7392746.0,
|
||
|
|
"step": 4010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.865422248840332,
|
||
|
|
"epoch": 0.33732409157739973,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004992762400081062,
|
||
|
|
"loss": 5.6537,
|
||
|
|
"mean_token_accuracy": 0.1529911682009697,
|
||
|
|
"num_tokens": 7401604.0,
|
||
|
|
"step": 4015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.859767580032349,
|
||
|
|
"epoch": 0.3377441713925646,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004992738379827559,
|
||
|
|
"loss": 5.7575,
|
||
|
|
"mean_token_accuracy": 0.15247822627425195,
|
||
|
|
"num_tokens": 7410594.0,
|
||
|
|
"step": 4020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.920141792297363,
|
||
|
|
"epoch": 0.33816425120772947,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004992714319845101,
|
||
|
|
"loss": 5.658,
|
||
|
|
"mean_token_accuracy": 0.16050563454627992,
|
||
|
|
"num_tokens": 7418831.0,
|
||
|
|
"step": 4025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.809229993820191,
|
||
|
|
"epoch": 0.33858433102289437,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004992690220134116,
|
||
|
|
"loss": 5.7047,
|
||
|
|
"mean_token_accuracy": 0.15451119393110274,
|
||
|
|
"num_tokens": 7427731.0,
|
||
|
|
"step": 4030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.96991548538208,
|
||
|
|
"epoch": 0.3390044108380592,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004992666080695027,
|
||
|
|
"loss": 5.8101,
|
||
|
|
"mean_token_accuracy": 0.14591643139719962,
|
||
|
|
"num_tokens": 7436447.0,
|
||
|
|
"step": 4035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9149298667907715,
|
||
|
|
"epoch": 0.3394244906532241,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004992641901528262,
|
||
|
|
"loss": 5.7195,
|
||
|
|
"mean_token_accuracy": 0.15583046823740004,
|
||
|
|
"num_tokens": 7445352.0,
|
||
|
|
"step": 4040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.908085584640503,
|
||
|
|
"epoch": 0.339844570468389,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004992617682634252,
|
||
|
|
"loss": 5.7887,
|
||
|
|
"mean_token_accuracy": 0.1540717288851738,
|
||
|
|
"num_tokens": 7454298.0,
|
||
|
|
"step": 4045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.891385746002197,
|
||
|
|
"epoch": 0.34026465028355385,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004992593424013424,
|
||
|
|
"loss": 5.7978,
|
||
|
|
"mean_token_accuracy": 0.15331364274024964,
|
||
|
|
"num_tokens": 7463543.0,
|
||
|
|
"step": 4050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.913450384140015,
|
||
|
|
"epoch": 0.34068473009871875,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004992569125666209,
|
||
|
|
"loss": 5.8148,
|
||
|
|
"mean_token_accuracy": 0.14611926972866057,
|
||
|
|
"num_tokens": 7472701.0,
|
||
|
|
"step": 4055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.010456657409668,
|
||
|
|
"epoch": 0.34110480991388364,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004992544787593037,
|
||
|
|
"loss": 5.817,
|
||
|
|
"mean_token_accuracy": 0.14246124625205994,
|
||
|
|
"num_tokens": 7481123.0,
|
||
|
|
"step": 4060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.905852317810059,
|
||
|
|
"epoch": 0.34152488972904854,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004992520409794338,
|
||
|
|
"loss": 5.8641,
|
||
|
|
"mean_token_accuracy": 0.1508338287472725,
|
||
|
|
"num_tokens": 7490439.0,
|
||
|
|
"step": 4065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.901952314376831,
|
||
|
|
"epoch": 0.3419449695442134,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004992495992270544,
|
||
|
|
"loss": 5.7351,
|
||
|
|
"mean_token_accuracy": 0.1509379267692566,
|
||
|
|
"num_tokens": 7499326.0,
|
||
|
|
"step": 4070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.938205337524414,
|
||
|
|
"epoch": 0.3423650493593783,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004992471535022089,
|
||
|
|
"loss": 5.7857,
|
||
|
|
"mean_token_accuracy": 0.1451237343251705,
|
||
|
|
"num_tokens": 7509407.0,
|
||
|
|
"step": 4075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.869676685333252,
|
||
|
|
"epoch": 0.3427851291745432,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004992447038049405,
|
||
|
|
"loss": 5.829,
|
||
|
|
"mean_token_accuracy": 0.14850043952465058,
|
||
|
|
"num_tokens": 7518443.0,
|
||
|
|
"step": 4080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.861940097808838,
|
||
|
|
"epoch": 0.343205208989708,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004992422501352927,
|
||
|
|
"loss": 5.6977,
|
||
|
|
"mean_token_accuracy": 0.15755705237388612,
|
||
|
|
"num_tokens": 7527609.0,
|
||
|
|
"step": 4085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.978248167037964,
|
||
|
|
"epoch": 0.3436252888048729,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004992397924933089,
|
||
|
|
"loss": 5.7788,
|
||
|
|
"mean_token_accuracy": 0.15250536054372787,
|
||
|
|
"num_tokens": 7536890.0,
|
||
|
|
"step": 4090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.899935388565064,
|
||
|
|
"epoch": 0.3440453686200378,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004992373308790325,
|
||
|
|
"loss": 5.731,
|
||
|
|
"mean_token_accuracy": 0.1621832400560379,
|
||
|
|
"num_tokens": 7546509.0,
|
||
|
|
"step": 4095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.818875694274903,
|
||
|
|
"epoch": 0.3444654484352027,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004992348652925074,
|
||
|
|
"loss": 5.7667,
|
||
|
|
"mean_token_accuracy": 0.15332106947898866,
|
||
|
|
"num_tokens": 7555336.0,
|
||
|
|
"step": 4100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.907353639602661,
|
||
|
|
"epoch": 0.34488552825036756,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004992323957337771,
|
||
|
|
"loss": 5.7278,
|
||
|
|
"mean_token_accuracy": 0.1509070634841919,
|
||
|
|
"num_tokens": 7565210.0,
|
||
|
|
"step": 4105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.929575109481812,
|
||
|
|
"epoch": 0.34530560806553245,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004992299222028855,
|
||
|
|
"loss": 5.8127,
|
||
|
|
"mean_token_accuracy": 0.15723925679922104,
|
||
|
|
"num_tokens": 7574516.0,
|
||
|
|
"step": 4110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.839164924621582,
|
||
|
|
"epoch": 0.34572568788069735,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004992274446998761,
|
||
|
|
"loss": 5.6588,
|
||
|
|
"mean_token_accuracy": 0.1544717237353325,
|
||
|
|
"num_tokens": 7583219.0,
|
||
|
|
"step": 4115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.916603851318359,
|
||
|
|
"epoch": 0.3461457676958622,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004992249632247929,
|
||
|
|
"loss": 5.902,
|
||
|
|
"mean_token_accuracy": 0.14321533888578414,
|
||
|
|
"num_tokens": 7592050.0,
|
||
|
|
"step": 4120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9809043407440186,
|
||
|
|
"epoch": 0.3465658475110271,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004992224777776802,
|
||
|
|
"loss": 5.732,
|
||
|
|
"mean_token_accuracy": 0.1493101716041565,
|
||
|
|
"num_tokens": 7600718.0,
|
||
|
|
"step": 4125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.901517105102539,
|
||
|
|
"epoch": 0.346985927326192,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004992199883585816,
|
||
|
|
"loss": 5.7557,
|
||
|
|
"mean_token_accuracy": 0.15382387340068818,
|
||
|
|
"num_tokens": 7609191.0,
|
||
|
|
"step": 4130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.910360288619995,
|
||
|
|
"epoch": 0.34740600714135683,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004992174949675413,
|
||
|
|
"loss": 5.7894,
|
||
|
|
"mean_token_accuracy": 0.152114437520504,
|
||
|
|
"num_tokens": 7618509.0,
|
||
|
|
"step": 4135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.890322875976563,
|
||
|
|
"epoch": 0.34782608695652173,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004992149976046037,
|
||
|
|
"loss": 5.7136,
|
||
|
|
"mean_token_accuracy": 0.15040391087532043,
|
||
|
|
"num_tokens": 7627851.0,
|
||
|
|
"step": 4140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.837684154510498,
|
||
|
|
"epoch": 0.3482461667716866,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004992124962698128,
|
||
|
|
"loss": 5.7584,
|
||
|
|
"mean_token_accuracy": 0.15606331154704095,
|
||
|
|
"num_tokens": 7636748.0,
|
||
|
|
"step": 4145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.921899652481079,
|
||
|
|
"epoch": 0.3486662465868515,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.000499209990963213,
|
||
|
|
"loss": 5.7078,
|
||
|
|
"mean_token_accuracy": 0.15208663642406464,
|
||
|
|
"num_tokens": 7645436.0,
|
||
|
|
"step": 4150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.917012548446655,
|
||
|
|
"epoch": 0.34908632640201637,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004992074816848487,
|
||
|
|
"loss": 5.8094,
|
||
|
|
"mean_token_accuracy": 0.15278877168893815,
|
||
|
|
"num_tokens": 7655414.0,
|
||
|
|
"step": 4155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.772976493835449,
|
||
|
|
"epoch": 0.34950640621718126,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004992049684347642,
|
||
|
|
"loss": 5.6074,
|
||
|
|
"mean_token_accuracy": 0.15534141510725022,
|
||
|
|
"num_tokens": 7664295.0,
|
||
|
|
"step": 4160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.917826843261719,
|
||
|
|
"epoch": 0.34992648603234616,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004992024512130042,
|
||
|
|
"loss": 5.7416,
|
||
|
|
"mean_token_accuracy": 0.15260617434978485,
|
||
|
|
"num_tokens": 7673295.0,
|
||
|
|
"step": 4165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.788580131530762,
|
||
|
|
"epoch": 0.350346565847511,
|
||
|
|
"grad_norm": 0.859375,
|
||
|
|
"learning_rate": 0.0004991999300196132,
|
||
|
|
"loss": 5.7469,
|
||
|
|
"mean_token_accuracy": 0.15305035635828973,
|
||
|
|
"num_tokens": 7682932.0,
|
||
|
|
"step": 4170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.923834562301636,
|
||
|
|
"epoch": 0.3507666456626759,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004991974048546359,
|
||
|
|
"loss": 5.753,
|
||
|
|
"mean_token_accuracy": 0.1500132530927658,
|
||
|
|
"num_tokens": 7692105.0,
|
||
|
|
"step": 4175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.925296068191528,
|
||
|
|
"epoch": 0.3511867254778408,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.000499194875718117,
|
||
|
|
"loss": 5.7511,
|
||
|
|
"mean_token_accuracy": 0.15551865100860596,
|
||
|
|
"num_tokens": 7701294.0,
|
||
|
|
"step": 4180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.861107254028321,
|
||
|
|
"epoch": 0.3516068052930057,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004991923426101013,
|
||
|
|
"loss": 5.7386,
|
||
|
|
"mean_token_accuracy": 0.14845747649669647,
|
||
|
|
"num_tokens": 7710964.0,
|
||
|
|
"step": 4185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.949919605255127,
|
||
|
|
"epoch": 0.35202688510817054,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004991898055306337,
|
||
|
|
"loss": 5.8577,
|
||
|
|
"mean_token_accuracy": 0.14658492356538771,
|
||
|
|
"num_tokens": 7719938.0,
|
||
|
|
"step": 4190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.951687955856324,
|
||
|
|
"epoch": 0.35244696492333544,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004991872644797591,
|
||
|
|
"loss": 5.7808,
|
||
|
|
"mean_token_accuracy": 0.15141311138868332,
|
||
|
|
"num_tokens": 7729129.0,
|
||
|
|
"step": 4195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.855287361145019,
|
||
|
|
"epoch": 0.35286704473850034,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004991847194575226,
|
||
|
|
"loss": 5.7901,
|
||
|
|
"mean_token_accuracy": 0.14619968980550765,
|
||
|
|
"num_tokens": 7738506.0,
|
||
|
|
"step": 4200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.942954778671265,
|
||
|
|
"epoch": 0.3532871245536652,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004991821704639693,
|
||
|
|
"loss": 5.8959,
|
||
|
|
"mean_token_accuracy": 0.14654064998030664,
|
||
|
|
"num_tokens": 7749320.0,
|
||
|
|
"step": 4205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 6.01116943359375,
|
||
|
|
"epoch": 0.3537072043688301,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004991796174991443,
|
||
|
|
"loss": 5.7415,
|
||
|
|
"mean_token_accuracy": 0.1537883497774601,
|
||
|
|
"num_tokens": 7758735.0,
|
||
|
|
"step": 4210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.822880458831787,
|
||
|
|
"epoch": 0.354127284183995,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004991770605630927,
|
||
|
|
"loss": 5.7132,
|
||
|
|
"mean_token_accuracy": 0.15271057039499283,
|
||
|
|
"num_tokens": 7767556.0,
|
||
|
|
"step": 4215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.818714237213134,
|
||
|
|
"epoch": 0.3545473639991598,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004991744996558599,
|
||
|
|
"loss": 5.7336,
|
||
|
|
"mean_token_accuracy": 0.15282744243741037,
|
||
|
|
"num_tokens": 7776615.0,
|
||
|
|
"step": 4220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.915001726150512,
|
||
|
|
"epoch": 0.3549674438143247,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004991719347774913,
|
||
|
|
"loss": 5.7682,
|
||
|
|
"mean_token_accuracy": 0.15577882081270217,
|
||
|
|
"num_tokens": 7785288.0,
|
||
|
|
"step": 4225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918221855163575,
|
||
|
|
"epoch": 0.3553875236294896,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004991693659280324,
|
||
|
|
"loss": 5.6811,
|
||
|
|
"mean_token_accuracy": 0.15442655980587006,
|
||
|
|
"num_tokens": 7794381.0,
|
||
|
|
"step": 4230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.821169424057007,
|
||
|
|
"epoch": 0.3558076034446545,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004991667931075284,
|
||
|
|
"loss": 5.6546,
|
||
|
|
"mean_token_accuracy": 0.15124934762716294,
|
||
|
|
"num_tokens": 7803265.0,
|
||
|
|
"step": 4235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.829122161865234,
|
||
|
|
"epoch": 0.35622768325981935,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004991642163160252,
|
||
|
|
"loss": 5.7671,
|
||
|
|
"mean_token_accuracy": 0.15388772487640381,
|
||
|
|
"num_tokens": 7812445.0,
|
||
|
|
"step": 4240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934730339050293,
|
||
|
|
"epoch": 0.35664776307498425,
|
||
|
|
"grad_norm": 0.87109375,
|
||
|
|
"learning_rate": 0.0004991616355535684,
|
||
|
|
"loss": 5.7542,
|
||
|
|
"mean_token_accuracy": 0.15821312218904496,
|
||
|
|
"num_tokens": 7822073.0,
|
||
|
|
"step": 4245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918817663192749,
|
||
|
|
"epoch": 0.35706784289014915,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004991590508202036,
|
||
|
|
"loss": 5.7264,
|
||
|
|
"mean_token_accuracy": 0.15280235260725022,
|
||
|
|
"num_tokens": 7831193.0,
|
||
|
|
"step": 4250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.89573392868042,
|
||
|
|
"epoch": 0.357487922705314,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004991564621159766,
|
||
|
|
"loss": 5.7728,
|
||
|
|
"mean_token_accuracy": 0.15194582045078278,
|
||
|
|
"num_tokens": 7840311.0,
|
||
|
|
"step": 4255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8731294631958,
|
||
|
|
"epoch": 0.3579080025204789,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004991538694409334,
|
||
|
|
"loss": 5.7954,
|
||
|
|
"mean_token_accuracy": 0.14721263125538825,
|
||
|
|
"num_tokens": 7849622.0,
|
||
|
|
"step": 4260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.876342821121216,
|
||
|
|
"epoch": 0.3583280823356438,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004991512727951198,
|
||
|
|
"loss": 5.7558,
|
||
|
|
"mean_token_accuracy": 0.15003474354743956,
|
||
|
|
"num_tokens": 7859494.0,
|
||
|
|
"step": 4265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9838221073150635,
|
||
|
|
"epoch": 0.3587481621508087,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004991486721785818,
|
||
|
|
"loss": 5.8503,
|
||
|
|
"mean_token_accuracy": 0.14846469163894654,
|
||
|
|
"num_tokens": 7868526.0,
|
||
|
|
"step": 4270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.859622812271118,
|
||
|
|
"epoch": 0.3591682419659735,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004991460675913655,
|
||
|
|
"loss": 5.6799,
|
||
|
|
"mean_token_accuracy": 0.1537486046552658,
|
||
|
|
"num_tokens": 7877631.0,
|
||
|
|
"step": 4275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.85202202796936,
|
||
|
|
"epoch": 0.3595883217811384,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.000499143459033517,
|
||
|
|
"loss": 5.7338,
|
||
|
|
"mean_token_accuracy": 0.15869542211294174,
|
||
|
|
"num_tokens": 7886814.0,
|
||
|
|
"step": 4280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.794212818145752,
|
||
|
|
"epoch": 0.3600084015963033,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004991408465050825,
|
||
|
|
"loss": 5.5727,
|
||
|
|
"mean_token_accuracy": 0.1595866084098816,
|
||
|
|
"num_tokens": 7896337.0,
|
||
|
|
"step": 4285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.852896070480346,
|
||
|
|
"epoch": 0.36042848141146816,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004991382300061084,
|
||
|
|
"loss": 5.8163,
|
||
|
|
"mean_token_accuracy": 0.14354490041732787,
|
||
|
|
"num_tokens": 7906071.0,
|
||
|
|
"step": 4290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.937732839584351,
|
||
|
|
"epoch": 0.36084856122663306,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004991356095366409,
|
||
|
|
"loss": 5.8111,
|
||
|
|
"mean_token_accuracy": 0.14974057525396348,
|
||
|
|
"num_tokens": 7915003.0,
|
||
|
|
"step": 4295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.904038953781128,
|
||
|
|
"epoch": 0.36126864104179796,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004991329850967266,
|
||
|
|
"loss": 5.6791,
|
||
|
|
"mean_token_accuracy": 0.15475230365991594,
|
||
|
|
"num_tokens": 7924408.0,
|
||
|
|
"step": 4300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8507331848144535,
|
||
|
|
"epoch": 0.3616887208569628,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004991303566864118,
|
||
|
|
"loss": 5.637,
|
||
|
|
"mean_token_accuracy": 0.1542945459485054,
|
||
|
|
"num_tokens": 7934717.0,
|
||
|
|
"step": 4305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7739667892456055,
|
||
|
|
"epoch": 0.3621088006721277,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004991277243057431,
|
||
|
|
"loss": 5.7101,
|
||
|
|
"mean_token_accuracy": 0.1505005143582821,
|
||
|
|
"num_tokens": 7944278.0,
|
||
|
|
"step": 4310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.808600950241089,
|
||
|
|
"epoch": 0.3625288804872926,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004991250879547673,
|
||
|
|
"loss": 5.7235,
|
||
|
|
"mean_token_accuracy": 0.1538018502295017,
|
||
|
|
"num_tokens": 7953344.0,
|
||
|
|
"step": 4315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.829892158508301,
|
||
|
|
"epoch": 0.3629489603024575,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004991224476335309,
|
||
|
|
"loss": 5.7448,
|
||
|
|
"mean_token_accuracy": 0.149826068431139,
|
||
|
|
"num_tokens": 7962869.0,
|
||
|
|
"step": 4320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.963926601409912,
|
||
|
|
"epoch": 0.36336904011762233,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004991198033420807,
|
||
|
|
"loss": 5.7344,
|
||
|
|
"mean_token_accuracy": 0.15306216776371,
|
||
|
|
"num_tokens": 7971981.0,
|
||
|
|
"step": 4325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.884770917892456,
|
||
|
|
"epoch": 0.36378911993278723,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004991171550804636,
|
||
|
|
"loss": 5.7019,
|
||
|
|
"mean_token_accuracy": 0.15474960654973985,
|
||
|
|
"num_tokens": 7980979.0,
|
||
|
|
"step": 4330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.863976860046387,
|
||
|
|
"epoch": 0.36420919974795213,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004991145028487266,
|
||
|
|
"loss": 5.7748,
|
||
|
|
"mean_token_accuracy": 0.1529791235923767,
|
||
|
|
"num_tokens": 7989607.0,
|
||
|
|
"step": 4335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7957190990448,
|
||
|
|
"epoch": 0.36462927956311697,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004991118466469165,
|
||
|
|
"loss": 5.5897,
|
||
|
|
"mean_token_accuracy": 0.1639975592494011,
|
||
|
|
"num_tokens": 7998356.0,
|
||
|
|
"step": 4340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.849919033050537,
|
||
|
|
"epoch": 0.36504935937828187,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004991091864750805,
|
||
|
|
"loss": 5.7033,
|
||
|
|
"mean_token_accuracy": 0.1553362563252449,
|
||
|
|
"num_tokens": 8007596.0,
|
||
|
|
"step": 4345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.909917688369751,
|
||
|
|
"epoch": 0.36546943919344677,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004991065223332655,
|
||
|
|
"loss": 5.7587,
|
||
|
|
"mean_token_accuracy": 0.15085091739892958,
|
||
|
|
"num_tokens": 8016493.0,
|
||
|
|
"step": 4350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.884606981277466,
|
||
|
|
"epoch": 0.36588951900861166,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004991038542215191,
|
||
|
|
"loss": 5.7272,
|
||
|
|
"mean_token_accuracy": 0.1481338232755661,
|
||
|
|
"num_tokens": 8025867.0,
|
||
|
|
"step": 4355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814969539642334,
|
||
|
|
"epoch": 0.3663095988237765,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004991011821398882,
|
||
|
|
"loss": 5.7464,
|
||
|
|
"mean_token_accuracy": 0.15548805743455887,
|
||
|
|
"num_tokens": 8036251.0,
|
||
|
|
"step": 4360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.905033826828003,
|
||
|
|
"epoch": 0.3667296786389414,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004990985060884202,
|
||
|
|
"loss": 5.7024,
|
||
|
|
"mean_token_accuracy": 0.1582213595509529,
|
||
|
|
"num_tokens": 8045647.0,
|
||
|
|
"step": 4365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.88990044593811,
|
||
|
|
"epoch": 0.3671497584541063,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004990958260671627,
|
||
|
|
"loss": 5.79,
|
||
|
|
"mean_token_accuracy": 0.1454270862042904,
|
||
|
|
"num_tokens": 8056025.0,
|
||
|
|
"step": 4370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.809770679473877,
|
||
|
|
"epoch": 0.36756983826927114,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004990931420761629,
|
||
|
|
"loss": 5.7083,
|
||
|
|
"mean_token_accuracy": 0.16103482097387314,
|
||
|
|
"num_tokens": 8065029.0,
|
||
|
|
"step": 4375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.914457368850708,
|
||
|
|
"epoch": 0.36798991808443604,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004990904541154685,
|
||
|
|
"loss": 5.6763,
|
||
|
|
"mean_token_accuracy": 0.16559941172599793,
|
||
|
|
"num_tokens": 8073249.0,
|
||
|
|
"step": 4380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.894069719314575,
|
||
|
|
"epoch": 0.36840999789960094,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004990877621851271,
|
||
|
|
"loss": 5.8002,
|
||
|
|
"mean_token_accuracy": 0.153408020734787,
|
||
|
|
"num_tokens": 8082039.0,
|
||
|
|
"step": 4385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8033387660980225,
|
||
|
|
"epoch": 0.3688300777147658,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.0004990850662851863,
|
||
|
|
"loss": 5.6375,
|
||
|
|
"mean_token_accuracy": 0.15707656592130662,
|
||
|
|
"num_tokens": 8090011.0,
|
||
|
|
"step": 4390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.879843854904175,
|
||
|
|
"epoch": 0.3692501575299307,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004990823664156941,
|
||
|
|
"loss": 5.7455,
|
||
|
|
"mean_token_accuracy": 0.1648575708270073,
|
||
|
|
"num_tokens": 8099934.0,
|
||
|
|
"step": 4395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.963798093795776,
|
||
|
|
"epoch": 0.3696702373450956,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004990796625766981,
|
||
|
|
"loss": 5.7681,
|
||
|
|
"mean_token_accuracy": 0.14946894496679305,
|
||
|
|
"num_tokens": 8108969.0,
|
||
|
|
"step": 4400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.835124111175537,
|
||
|
|
"epoch": 0.3700903171602605,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004990769547682462,
|
||
|
|
"loss": 5.6935,
|
||
|
|
"mean_token_accuracy": 0.15169232487678527,
|
||
|
|
"num_tokens": 8117372.0,
|
||
|
|
"step": 4405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.979207563400268,
|
||
|
|
"epoch": 0.3705103969754253,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004990742429903866,
|
||
|
|
"loss": 5.8757,
|
||
|
|
"mean_token_accuracy": 0.14571133852005005,
|
||
|
|
"num_tokens": 8127108.0,
|
||
|
|
"step": 4410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.961515951156616,
|
||
|
|
"epoch": 0.3709304767905902,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.000499071527243167,
|
||
|
|
"loss": 5.8507,
|
||
|
|
"mean_token_accuracy": 0.14516980648040773,
|
||
|
|
"num_tokens": 8137392.0,
|
||
|
|
"step": 4415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.880073976516724,
|
||
|
|
"epoch": 0.3713505566057551,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004990688075266357,
|
||
|
|
"loss": 5.7019,
|
||
|
|
"mean_token_accuracy": 0.15986401289701463,
|
||
|
|
"num_tokens": 8146257.0,
|
||
|
|
"step": 4420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.805649709701538,
|
||
|
|
"epoch": 0.37177063642091995,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004990660838408409,
|
||
|
|
"loss": 5.6521,
|
||
|
|
"mean_token_accuracy": 0.15721987932920456,
|
||
|
|
"num_tokens": 8154952.0,
|
||
|
|
"step": 4425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.893301391601563,
|
||
|
|
"epoch": 0.37219071623608485,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004990633561858308,
|
||
|
|
"loss": 5.7106,
|
||
|
|
"mean_token_accuracy": 0.14765800014138222,
|
||
|
|
"num_tokens": 8164365.0,
|
||
|
|
"step": 4430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.924961233139038,
|
||
|
|
"epoch": 0.37261079605124975,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004990606245616537,
|
||
|
|
"loss": 5.7205,
|
||
|
|
"mean_token_accuracy": 0.15445269271731377,
|
||
|
|
"num_tokens": 8172614.0,
|
||
|
|
"step": 4435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.877901983261109,
|
||
|
|
"epoch": 0.37303087586641465,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004990578889683579,
|
||
|
|
"loss": 5.7888,
|
||
|
|
"mean_token_accuracy": 0.150545197725296,
|
||
|
|
"num_tokens": 8182445.0,
|
||
|
|
"step": 4440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.885668134689331,
|
||
|
|
"epoch": 0.3734509556815795,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004990551494059921,
|
||
|
|
"loss": 5.6613,
|
||
|
|
"mean_token_accuracy": 0.15747766494750975,
|
||
|
|
"num_tokens": 8191871.0,
|
||
|
|
"step": 4445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.893858480453491,
|
||
|
|
"epoch": 0.3738710354967444,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004990524058746047,
|
||
|
|
"loss": 5.8285,
|
||
|
|
"mean_token_accuracy": 0.15561486929655075,
|
||
|
|
"num_tokens": 8200658.0,
|
||
|
|
"step": 4450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.879518842697143,
|
||
|
|
"epoch": 0.3742911153119093,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004990496583742443,
|
||
|
|
"loss": 5.7547,
|
||
|
|
"mean_token_accuracy": 0.15101703256368637,
|
||
|
|
"num_tokens": 8209776.0,
|
||
|
|
"step": 4455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.868221950531006,
|
||
|
|
"epoch": 0.3747111951270741,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004990469069049596,
|
||
|
|
"loss": 5.6747,
|
||
|
|
"mean_token_accuracy": 0.15401403456926346,
|
||
|
|
"num_tokens": 8219401.0,
|
||
|
|
"step": 4460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.809508180618286,
|
||
|
|
"epoch": 0.375131274942239,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004990441514667993,
|
||
|
|
"loss": 5.7095,
|
||
|
|
"mean_token_accuracy": 0.15698247104883195,
|
||
|
|
"num_tokens": 8228762.0,
|
||
|
|
"step": 4465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.932300424575805,
|
||
|
|
"epoch": 0.3755513547574039,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004990413920598121,
|
||
|
|
"loss": 5.7223,
|
||
|
|
"mean_token_accuracy": 0.15662275701761247,
|
||
|
|
"num_tokens": 8236612.0,
|
||
|
|
"step": 4470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.896757698059082,
|
||
|
|
"epoch": 0.37597143457256876,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004990386286840471,
|
||
|
|
"loss": 5.7335,
|
||
|
|
"mean_token_accuracy": 0.15207386016845703,
|
||
|
|
"num_tokens": 8245043.0,
|
||
|
|
"step": 4475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.995736980438233,
|
||
|
|
"epoch": 0.37639151438773366,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004990358613395532,
|
||
|
|
"loss": 5.8307,
|
||
|
|
"mean_token_accuracy": 0.15044568330049515,
|
||
|
|
"num_tokens": 8255270.0,
|
||
|
|
"step": 4480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.938156318664551,
|
||
|
|
"epoch": 0.37681159420289856,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004990330900263792,
|
||
|
|
"loss": 5.7971,
|
||
|
|
"mean_token_accuracy": 0.14653817862272261,
|
||
|
|
"num_tokens": 8264761.0,
|
||
|
|
"step": 4485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8954840183258055,
|
||
|
|
"epoch": 0.37723167401806346,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004990303147445745,
|
||
|
|
"loss": 5.7454,
|
||
|
|
"mean_token_accuracy": 0.15479619354009627,
|
||
|
|
"num_tokens": 8274308.0,
|
||
|
|
"step": 4490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.815971899032593,
|
||
|
|
"epoch": 0.3776517538332283,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004990275354941881,
|
||
|
|
"loss": 5.6288,
|
||
|
|
"mean_token_accuracy": 0.1646218091249466,
|
||
|
|
"num_tokens": 8283323.0,
|
||
|
|
"step": 4495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.893220853805542,
|
||
|
|
"epoch": 0.3780718336483932,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004990247522752694,
|
||
|
|
"loss": 5.9629,
|
||
|
|
"mean_token_accuracy": 0.14029839560389518,
|
||
|
|
"num_tokens": 8293452.0,
|
||
|
|
"step": 4500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.897252893447876,
|
||
|
|
"epoch": 0.3784919134635581,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004990219650878674,
|
||
|
|
"loss": 5.6576,
|
||
|
|
"mean_token_accuracy": 0.16113524734973908,
|
||
|
|
"num_tokens": 8302941.0,
|
||
|
|
"step": 4505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.781876134872436,
|
||
|
|
"epoch": 0.37891199327872294,
|
||
|
|
"grad_norm": 1.4140625,
|
||
|
|
"learning_rate": 0.0004990191739320318,
|
||
|
|
"loss": 5.6671,
|
||
|
|
"mean_token_accuracy": 0.1652265876531601,
|
||
|
|
"num_tokens": 8311811.0,
|
||
|
|
"step": 4510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.76027250289917,
|
||
|
|
"epoch": 0.37933207309388783,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004990163788078117,
|
||
|
|
"loss": 5.5692,
|
||
|
|
"mean_token_accuracy": 0.15842368602752685,
|
||
|
|
"num_tokens": 8321130.0,
|
||
|
|
"step": 4515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.842820358276367,
|
||
|
|
"epoch": 0.37975215290905273,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004990135797152569,
|
||
|
|
"loss": 5.6768,
|
||
|
|
"mean_token_accuracy": 0.15367345213890077,
|
||
|
|
"num_tokens": 8330233.0,
|
||
|
|
"step": 4520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.770590019226074,
|
||
|
|
"epoch": 0.3801722327242176,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004990107766544169,
|
||
|
|
"loss": 5.6599,
|
||
|
|
"mean_token_accuracy": 0.16070746779441833,
|
||
|
|
"num_tokens": 8338585.0,
|
||
|
|
"step": 4525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.844082069396973,
|
||
|
|
"epoch": 0.38059231253938247,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004990079696253413,
|
||
|
|
"loss": 5.7068,
|
||
|
|
"mean_token_accuracy": 0.15848116278648378,
|
||
|
|
"num_tokens": 8346618.0,
|
||
|
|
"step": 4530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.902699136734009,
|
||
|
|
"epoch": 0.38101239235454737,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004990051586280799,
|
||
|
|
"loss": 5.6829,
|
||
|
|
"mean_token_accuracy": 0.15385363698005677,
|
||
|
|
"num_tokens": 8356273.0,
|
||
|
|
"step": 4535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.847843742370605,
|
||
|
|
"epoch": 0.38143247216971227,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004990023436626824,
|
||
|
|
"loss": 5.674,
|
||
|
|
"mean_token_accuracy": 0.15799472630023956,
|
||
|
|
"num_tokens": 8366668.0,
|
||
|
|
"step": 4540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.954341840744019,
|
||
|
|
"epoch": 0.3818525519848771,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004989995247291988,
|
||
|
|
"loss": 5.7933,
|
||
|
|
"mean_token_accuracy": 0.15496921986341478,
|
||
|
|
"num_tokens": 8375610.0,
|
||
|
|
"step": 4545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.860501337051391,
|
||
|
|
"epoch": 0.382272631800042,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004989967018276789,
|
||
|
|
"loss": 5.6729,
|
||
|
|
"mean_token_accuracy": 0.1558580845594406,
|
||
|
|
"num_tokens": 8384455.0,
|
||
|
|
"step": 4550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7317808151245115,
|
||
|
|
"epoch": 0.3826927116152069,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004989938749581727,
|
||
|
|
"loss": 5.7105,
|
||
|
|
"mean_token_accuracy": 0.14987761974334718,
|
||
|
|
"num_tokens": 8393868.0,
|
||
|
|
"step": 4555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8618772506713865,
|
||
|
|
"epoch": 0.38311279143037175,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004989910441207305,
|
||
|
|
"loss": 5.7312,
|
||
|
|
"mean_token_accuracy": 0.15411882251501083,
|
||
|
|
"num_tokens": 8402916.0,
|
||
|
|
"step": 4560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830321025848389,
|
||
|
|
"epoch": 0.38353287124553664,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004989882093154023,
|
||
|
|
"loss": 5.6485,
|
||
|
|
"mean_token_accuracy": 0.1575123891234398,
|
||
|
|
"num_tokens": 8411649.0,
|
||
|
|
"step": 4565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8616162776947025,
|
||
|
|
"epoch": 0.38395295106070154,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004989853705422381,
|
||
|
|
"loss": 5.769,
|
||
|
|
"mean_token_accuracy": 0.14645260721445083,
|
||
|
|
"num_tokens": 8420393.0,
|
||
|
|
"step": 4570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.813478136062622,
|
||
|
|
"epoch": 0.38437303087586644,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004989825278012886,
|
||
|
|
"loss": 5.6629,
|
||
|
|
"mean_token_accuracy": 0.154879230260849,
|
||
|
|
"num_tokens": 8429404.0,
|
||
|
|
"step": 4575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.851570463180542,
|
||
|
|
"epoch": 0.3847931106910313,
|
||
|
|
"grad_norm": 1.078125,
|
||
|
|
"learning_rate": 0.000498979681092604,
|
||
|
|
"loss": 5.703,
|
||
|
|
"mean_token_accuracy": 0.149764809012413,
|
||
|
|
"num_tokens": 8438299.0,
|
||
|
|
"step": 4580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.760462951660156,
|
||
|
|
"epoch": 0.3852131905061962,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004989768304162345,
|
||
|
|
"loss": 5.6615,
|
||
|
|
"mean_token_accuracy": 0.15541962534189224,
|
||
|
|
"num_tokens": 8447392.0,
|
||
|
|
"step": 4585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.89559907913208,
|
||
|
|
"epoch": 0.3856332703213611,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004989739757722308,
|
||
|
|
"loss": 5.7474,
|
||
|
|
"mean_token_accuracy": 0.14751126170158385,
|
||
|
|
"num_tokens": 8456361.0,
|
||
|
|
"step": 4590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.852615118026733,
|
||
|
|
"epoch": 0.3860533501365259,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004989711171606436,
|
||
|
|
"loss": 5.6747,
|
||
|
|
"mean_token_accuracy": 0.15710035860538482,
|
||
|
|
"num_tokens": 8465548.0,
|
||
|
|
"step": 4595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.885403347015381,
|
||
|
|
"epoch": 0.3864734299516908,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004989682545815232,
|
||
|
|
"loss": 5.6869,
|
||
|
|
"mean_token_accuracy": 0.1525876745581627,
|
||
|
|
"num_tokens": 8474454.0,
|
||
|
|
"step": 4600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8074538230896,
|
||
|
|
"epoch": 0.3868935097668557,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004989653880349207,
|
||
|
|
"loss": 5.6074,
|
||
|
|
"mean_token_accuracy": 0.1573283538222313,
|
||
|
|
"num_tokens": 8482694.0,
|
||
|
|
"step": 4605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.842355585098266,
|
||
|
|
"epoch": 0.38731358958202056,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004989625175208864,
|
||
|
|
"loss": 5.7257,
|
||
|
|
"mean_token_accuracy": 0.15177675783634187,
|
||
|
|
"num_tokens": 8491162.0,
|
||
|
|
"step": 4610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.787636756896973,
|
||
|
|
"epoch": 0.38773366939718545,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004989596430394717,
|
||
|
|
"loss": 5.5752,
|
||
|
|
"mean_token_accuracy": 0.17091956436634065,
|
||
|
|
"num_tokens": 8500716.0,
|
||
|
|
"step": 4615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7534934997558596,
|
||
|
|
"epoch": 0.38815374921235035,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000498956764590727,
|
||
|
|
"loss": 5.6231,
|
||
|
|
"mean_token_accuracy": 0.1520329423248768,
|
||
|
|
"num_tokens": 8508871.0,
|
||
|
|
"step": 4620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.890595149993897,
|
||
|
|
"epoch": 0.38857382902751525,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004989538821747037,
|
||
|
|
"loss": 5.8315,
|
||
|
|
"mean_token_accuracy": 0.15000174939632416,
|
||
|
|
"num_tokens": 8518450.0,
|
||
|
|
"step": 4625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.941072607040406,
|
||
|
|
"epoch": 0.3889939088426801,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004989509957914527,
|
||
|
|
"loss": 5.7284,
|
||
|
|
"mean_token_accuracy": 0.15086407959461212,
|
||
|
|
"num_tokens": 8528238.0,
|
||
|
|
"step": 4630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.794663810729981,
|
||
|
|
"epoch": 0.389413988657845,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004989481054410251,
|
||
|
|
"loss": 5.6258,
|
||
|
|
"mean_token_accuracy": 0.1528220996260643,
|
||
|
|
"num_tokens": 8537587.0,
|
||
|
|
"step": 4635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.795312023162841,
|
||
|
|
"epoch": 0.3898340684730099,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004989452111234721,
|
||
|
|
"loss": 5.7462,
|
||
|
|
"mean_token_accuracy": 0.1528109699487686,
|
||
|
|
"num_tokens": 8547703.0,
|
||
|
|
"step": 4640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.84535961151123,
|
||
|
|
"epoch": 0.39025414828817473,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.000498942312838845,
|
||
|
|
"loss": 5.6766,
|
||
|
|
"mean_token_accuracy": 0.1572122886776924,
|
||
|
|
"num_tokens": 8557001.0,
|
||
|
|
"step": 4645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.796119689941406,
|
||
|
|
"epoch": 0.3906742281033396,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004989394105871952,
|
||
|
|
"loss": 5.5616,
|
||
|
|
"mean_token_accuracy": 0.16711176037788392,
|
||
|
|
"num_tokens": 8565638.0,
|
||
|
|
"step": 4650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.91137285232544,
|
||
|
|
"epoch": 0.3910943079185045,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000498936504368574,
|
||
|
|
"loss": 5.7305,
|
||
|
|
"mean_token_accuracy": 0.15593890845775604,
|
||
|
|
"num_tokens": 8574428.0,
|
||
|
|
"step": 4655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.800365591049195,
|
||
|
|
"epoch": 0.3915143877336694,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004989335941830329,
|
||
|
|
"loss": 5.684,
|
||
|
|
"mean_token_accuracy": 0.15439117401838304,
|
||
|
|
"num_tokens": 8583157.0,
|
||
|
|
"step": 4660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.817437553405762,
|
||
|
|
"epoch": 0.39193446754883426,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004989306800306236,
|
||
|
|
"loss": 5.6621,
|
||
|
|
"mean_token_accuracy": 0.149759341776371,
|
||
|
|
"num_tokens": 8592382.0,
|
||
|
|
"step": 4665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7860520362854,
|
||
|
|
"epoch": 0.39235454736399916,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004989277619113975,
|
||
|
|
"loss": 5.6345,
|
||
|
|
"mean_token_accuracy": 0.16216987669467925,
|
||
|
|
"num_tokens": 8601058.0,
|
||
|
|
"step": 4670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.875742197036743,
|
||
|
|
"epoch": 0.39277462717916406,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004989248398254065,
|
||
|
|
"loss": 5.7352,
|
||
|
|
"mean_token_accuracy": 0.15142691284418106,
|
||
|
|
"num_tokens": 8609479.0,
|
||
|
|
"step": 4675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.859423112869263,
|
||
|
|
"epoch": 0.3931947069943289,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004989219137727021,
|
||
|
|
"loss": 5.7036,
|
||
|
|
"mean_token_accuracy": 0.15549542009830475,
|
||
|
|
"num_tokens": 8618860.0,
|
||
|
|
"step": 4680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.81779637336731,
|
||
|
|
"epoch": 0.3936147868094938,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004989189837533365,
|
||
|
|
"loss": 5.6363,
|
||
|
|
"mean_token_accuracy": 0.1587088495492935,
|
||
|
|
"num_tokens": 8627462.0,
|
||
|
|
"step": 4685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.924579429626465,
|
||
|
|
"epoch": 0.3940348666246587,
|
||
|
|
"grad_norm": 0.83203125,
|
||
|
|
"learning_rate": 0.0004989160497673613,
|
||
|
|
"loss": 5.8254,
|
||
|
|
"mean_token_accuracy": 0.1513897880911827,
|
||
|
|
"num_tokens": 8637569.0,
|
||
|
|
"step": 4690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.850678825378418,
|
||
|
|
"epoch": 0.39445494643982354,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004989131118148286,
|
||
|
|
"loss": 5.6177,
|
||
|
|
"mean_token_accuracy": 0.15605207085609435,
|
||
|
|
"num_tokens": 8645440.0,
|
||
|
|
"step": 4695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.835308980941773,
|
||
|
|
"epoch": 0.39487502625498844,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004989101698957904,
|
||
|
|
"loss": 5.7682,
|
||
|
|
"mean_token_accuracy": 0.15626595616340638,
|
||
|
|
"num_tokens": 8655077.0,
|
||
|
|
"step": 4700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830049610137939,
|
||
|
|
"epoch": 0.39529510607015333,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004989072240102988,
|
||
|
|
"loss": 5.6957,
|
||
|
|
"mean_token_accuracy": 0.16012858897447585,
|
||
|
|
"num_tokens": 8663126.0,
|
||
|
|
"step": 4705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.901100158691406,
|
||
|
|
"epoch": 0.39571518588531823,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004989042741584061,
|
||
|
|
"loss": 5.6726,
|
||
|
|
"mean_token_accuracy": 0.15270041525363923,
|
||
|
|
"num_tokens": 8672386.0,
|
||
|
|
"step": 4710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7314942359924315,
|
||
|
|
"epoch": 0.3961352657004831,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004989013203401645,
|
||
|
|
"loss": 5.612,
|
||
|
|
"mean_token_accuracy": 0.1580759972333908,
|
||
|
|
"num_tokens": 8681930.0,
|
||
|
|
"step": 4715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.797902965545655,
|
||
|
|
"epoch": 0.396555345515648,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004988983625556264,
|
||
|
|
"loss": 5.6787,
|
||
|
|
"mean_token_accuracy": 0.15581901967525483,
|
||
|
|
"num_tokens": 8690993.0,
|
||
|
|
"step": 4720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.798060894012451,
|
||
|
|
"epoch": 0.39697542533081287,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004988954008048438,
|
||
|
|
"loss": 5.672,
|
||
|
|
"mean_token_accuracy": 0.15935962349176408,
|
||
|
|
"num_tokens": 8699497.0,
|
||
|
|
"step": 4725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.933620643615723,
|
||
|
|
"epoch": 0.3973955051459777,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004988924350878697,
|
||
|
|
"loss": 5.8568,
|
||
|
|
"mean_token_accuracy": 0.14457278251647948,
|
||
|
|
"num_tokens": 8709274.0,
|
||
|
|
"step": 4730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.934816789627075,
|
||
|
|
"epoch": 0.3978155849611426,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004988894654047563,
|
||
|
|
"loss": 5.7297,
|
||
|
|
"mean_token_accuracy": 0.15009873509407043,
|
||
|
|
"num_tokens": 8718158.0,
|
||
|
|
"step": 4735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.786411237716675,
|
||
|
|
"epoch": 0.3982356647763075,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004988864917555562,
|
||
|
|
"loss": 5.5866,
|
||
|
|
"mean_token_accuracy": 0.15930677056312562,
|
||
|
|
"num_tokens": 8727459.0,
|
||
|
|
"step": 4740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.864226961135865,
|
||
|
|
"epoch": 0.3986557445914724,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004988835141403224,
|
||
|
|
"loss": 5.7293,
|
||
|
|
"mean_token_accuracy": 0.15878916680812835,
|
||
|
|
"num_tokens": 8737614.0,
|
||
|
|
"step": 4745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.824589109420776,
|
||
|
|
"epoch": 0.39907582440663725,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004988805325591073,
|
||
|
|
"loss": 5.56,
|
||
|
|
"mean_token_accuracy": 0.15695197582244874,
|
||
|
|
"num_tokens": 8746799.0,
|
||
|
|
"step": 4750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8385083198547365,
|
||
|
|
"epoch": 0.39949590422180214,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004988775470119639,
|
||
|
|
"loss": 5.7326,
|
||
|
|
"mean_token_accuracy": 0.14953183978796006,
|
||
|
|
"num_tokens": 8756555.0,
|
||
|
|
"step": 4755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7729175090789795,
|
||
|
|
"epoch": 0.39991598403696704,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004988745574989451,
|
||
|
|
"loss": 5.7535,
|
||
|
|
"mean_token_accuracy": 0.15938151776790618,
|
||
|
|
"num_tokens": 8765849.0,
|
||
|
|
"step": 4760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.965050411224365,
|
||
|
|
"epoch": 0.4003360638521319,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004988715640201036,
|
||
|
|
"loss": 5.8322,
|
||
|
|
"mean_token_accuracy": 0.14530889242887496,
|
||
|
|
"num_tokens": 8775713.0,
|
||
|
|
"step": 4765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.839820480346679,
|
||
|
|
"epoch": 0.4007561436672968,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004988685665754928,
|
||
|
|
"loss": 5.6466,
|
||
|
|
"mean_token_accuracy": 0.1569948598742485,
|
||
|
|
"num_tokens": 8784717.0,
|
||
|
|
"step": 4770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.792028474807739,
|
||
|
|
"epoch": 0.4011762234824617,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004988655651651656,
|
||
|
|
"loss": 5.6649,
|
||
|
|
"mean_token_accuracy": 0.15628512352705,
|
||
|
|
"num_tokens": 8794388.0,
|
||
|
|
"step": 4775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.755618572235107,
|
||
|
|
"epoch": 0.4015963032976265,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004988625597891751,
|
||
|
|
"loss": 5.6762,
|
||
|
|
"mean_token_accuracy": 0.15925197303295135,
|
||
|
|
"num_tokens": 8802436.0,
|
||
|
|
"step": 4780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.85797004699707,
|
||
|
|
"epoch": 0.4020163831127914,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004988595504475746,
|
||
|
|
"loss": 5.6376,
|
||
|
|
"mean_token_accuracy": 0.15845684409141542,
|
||
|
|
"num_tokens": 8811184.0,
|
||
|
|
"step": 4785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.920813274383545,
|
||
|
|
"epoch": 0.4024364629279563,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004988565371404175,
|
||
|
|
"loss": 5.7115,
|
||
|
|
"mean_token_accuracy": 0.15826244726777078,
|
||
|
|
"num_tokens": 8820525.0,
|
||
|
|
"step": 4790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.790119886398315,
|
||
|
|
"epoch": 0.4028565427431212,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004988535198677571,
|
||
|
|
"loss": 5.5798,
|
||
|
|
"mean_token_accuracy": 0.16315356642007828,
|
||
|
|
"num_tokens": 8828928.0,
|
||
|
|
"step": 4795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.902295684814453,
|
||
|
|
"epoch": 0.40327662255828606,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004988504986296469,
|
||
|
|
"loss": 5.7884,
|
||
|
|
"mean_token_accuracy": 0.1443356990814209,
|
||
|
|
"num_tokens": 8838615.0,
|
||
|
|
"step": 4800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.862144041061401,
|
||
|
|
"epoch": 0.40369670237345096,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004988474734261404,
|
||
|
|
"loss": 5.769,
|
||
|
|
"mean_token_accuracy": 0.1485462300479412,
|
||
|
|
"num_tokens": 8848709.0,
|
||
|
|
"step": 4805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8929126262664795,
|
||
|
|
"epoch": 0.40411678218861585,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004988444442572911,
|
||
|
|
"loss": 5.7251,
|
||
|
|
"mean_token_accuracy": 0.14630650877952575,
|
||
|
|
"num_tokens": 8858277.0,
|
||
|
|
"step": 4810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814572858810425,
|
||
|
|
"epoch": 0.4045368620037807,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004988414111231528,
|
||
|
|
"loss": 5.6716,
|
||
|
|
"mean_token_accuracy": 0.15942000597715378,
|
||
|
|
"num_tokens": 8868436.0,
|
||
|
|
"step": 4815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8521270751953125,
|
||
|
|
"epoch": 0.4049569418189456,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.000498838374023779,
|
||
|
|
"loss": 5.6738,
|
||
|
|
"mean_token_accuracy": 0.15392234772443772,
|
||
|
|
"num_tokens": 8877740.0,
|
||
|
|
"step": 4820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.896619701385498,
|
||
|
|
"epoch": 0.4053770216341105,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004988353329592239,
|
||
|
|
"loss": 5.6449,
|
||
|
|
"mean_token_accuracy": 0.15986622273921966,
|
||
|
|
"num_tokens": 8887408.0,
|
||
|
|
"step": 4825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.889400386810303,
|
||
|
|
"epoch": 0.4057971014492754,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004988322879295409,
|
||
|
|
"loss": 5.8084,
|
||
|
|
"mean_token_accuracy": 0.151357901096344,
|
||
|
|
"num_tokens": 8897141.0,
|
||
|
|
"step": 4830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.732660865783691,
|
||
|
|
"epoch": 0.40621718126444023,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004988292389347844,
|
||
|
|
"loss": 5.5937,
|
||
|
|
"mean_token_accuracy": 0.16834330409765244,
|
||
|
|
"num_tokens": 8905747.0,
|
||
|
|
"step": 4835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.910235500335693,
|
||
|
|
"epoch": 0.40663726107960513,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.000498826185975008,
|
||
|
|
"loss": 5.7403,
|
||
|
|
"mean_token_accuracy": 0.15051692128181457,
|
||
|
|
"num_tokens": 8914926.0,
|
||
|
|
"step": 4840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.855715417861939,
|
||
|
|
"epoch": 0.40705734089477,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004988231290502662,
|
||
|
|
"loss": 5.7351,
|
||
|
|
"mean_token_accuracy": 0.15608510375022888,
|
||
|
|
"num_tokens": 8923956.0,
|
||
|
|
"step": 4845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.844746065139771,
|
||
|
|
"epoch": 0.40747742070993487,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004988200681606127,
|
||
|
|
"loss": 5.6105,
|
||
|
|
"mean_token_accuracy": 0.15472539961338044,
|
||
|
|
"num_tokens": 8932654.0,
|
||
|
|
"step": 4850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.819759750366211,
|
||
|
|
"epoch": 0.40789750052509977,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.000498817003306102,
|
||
|
|
"loss": 5.602,
|
||
|
|
"mean_token_accuracy": 0.1623125731945038,
|
||
|
|
"num_tokens": 8941716.0,
|
||
|
|
"step": 4855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.776214361190796,
|
||
|
|
"epoch": 0.40831758034026466,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004988139344867884,
|
||
|
|
"loss": 5.6825,
|
||
|
|
"mean_token_accuracy": 0.1535426653921604,
|
||
|
|
"num_tokens": 8950377.0,
|
||
|
|
"step": 4860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.807446241378784,
|
||
|
|
"epoch": 0.4087376601554295,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004988108617027261,
|
||
|
|
"loss": 5.6579,
|
||
|
|
"mean_token_accuracy": 0.15453788191080092,
|
||
|
|
"num_tokens": 8959857.0,
|
||
|
|
"step": 4865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.781218004226685,
|
||
|
|
"epoch": 0.4091577399705944,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004988077849539698,
|
||
|
|
"loss": 5.5902,
|
||
|
|
"mean_token_accuracy": 0.15969525128602982,
|
||
|
|
"num_tokens": 8968272.0,
|
||
|
|
"step": 4870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.820656394958496,
|
||
|
|
"epoch": 0.4095778197857593,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004988047042405736,
|
||
|
|
"loss": 5.6674,
|
||
|
|
"mean_token_accuracy": 0.15931978076696396,
|
||
|
|
"num_tokens": 8977445.0,
|
||
|
|
"step": 4875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.915397691726684,
|
||
|
|
"epoch": 0.4099978996009242,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004988016195625924,
|
||
|
|
"loss": 5.7299,
|
||
|
|
"mean_token_accuracy": 0.15139664933085442,
|
||
|
|
"num_tokens": 8987315.0,
|
||
|
|
"step": 4880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.871594667434692,
|
||
|
|
"epoch": 0.41041797941608904,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004987985309200807,
|
||
|
|
"loss": 5.7173,
|
||
|
|
"mean_token_accuracy": 0.15377188473939896,
|
||
|
|
"num_tokens": 8998119.0,
|
||
|
|
"step": 4885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.775591278076172,
|
||
|
|
"epoch": 0.41083805923125394,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004987954383130934,
|
||
|
|
"loss": 5.6066,
|
||
|
|
"mean_token_accuracy": 0.16712582856416702,
|
||
|
|
"num_tokens": 9007167.0,
|
||
|
|
"step": 4890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.807595109939575,
|
||
|
|
"epoch": 0.41125813904641884,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.000498792341741685,
|
||
|
|
"loss": 5.6687,
|
||
|
|
"mean_token_accuracy": 0.1526729181408882,
|
||
|
|
"num_tokens": 9016690.0,
|
||
|
|
"step": 4895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.874031114578247,
|
||
|
|
"epoch": 0.4116782188615837,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004987892412059106,
|
||
|
|
"loss": 5.758,
|
||
|
|
"mean_token_accuracy": 0.15407043546438218,
|
||
|
|
"num_tokens": 9026117.0,
|
||
|
|
"step": 4900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.780725193023682,
|
||
|
|
"epoch": 0.4120982986767486,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004987861367058251,
|
||
|
|
"loss": 5.644,
|
||
|
|
"mean_token_accuracy": 0.1559523746371269,
|
||
|
|
"num_tokens": 9035754.0,
|
||
|
|
"step": 4905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.826504945755005,
|
||
|
|
"epoch": 0.4125183784919135,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004987830282414833,
|
||
|
|
"loss": 5.642,
|
||
|
|
"mean_token_accuracy": 0.15711333677172662,
|
||
|
|
"num_tokens": 9045453.0,
|
||
|
|
"step": 4910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.873796701431274,
|
||
|
|
"epoch": 0.41293845830707837,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004987799158129404,
|
||
|
|
"loss": 5.7527,
|
||
|
|
"mean_token_accuracy": 0.15677697360515594,
|
||
|
|
"num_tokens": 9056045.0,
|
||
|
|
"step": 4915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.820205545425415,
|
||
|
|
"epoch": 0.4133585381222432,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004987767994202516,
|
||
|
|
"loss": 5.6455,
|
||
|
|
"mean_token_accuracy": 0.1496775045990944,
|
||
|
|
"num_tokens": 9065728.0,
|
||
|
|
"step": 4920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.809246253967285,
|
||
|
|
"epoch": 0.4137786179374081,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004987736790634719,
|
||
|
|
"loss": 5.6661,
|
||
|
|
"mean_token_accuracy": 0.15184428542852402,
|
||
|
|
"num_tokens": 9075522.0,
|
||
|
|
"step": 4925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.794481945037842,
|
||
|
|
"epoch": 0.414198697752573,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004987705547426568,
|
||
|
|
"loss": 5.6358,
|
||
|
|
"mean_token_accuracy": 0.1499626338481903,
|
||
|
|
"num_tokens": 9084412.0,
|
||
|
|
"step": 4930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.868565320968628,
|
||
|
|
"epoch": 0.41461877756773785,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004987674264578615,
|
||
|
|
"loss": 5.6942,
|
||
|
|
"mean_token_accuracy": 0.15214097648859023,
|
||
|
|
"num_tokens": 9094289.0,
|
||
|
|
"step": 4935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.820976829528808,
|
||
|
|
"epoch": 0.41503885738290275,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004987642942091414,
|
||
|
|
"loss": 5.6177,
|
||
|
|
"mean_token_accuracy": 0.15684758871793747,
|
||
|
|
"num_tokens": 9103124.0,
|
||
|
|
"step": 4940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.808840274810791,
|
||
|
|
"epoch": 0.41545893719806765,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004987611579965523,
|
||
|
|
"loss": 5.5534,
|
||
|
|
"mean_token_accuracy": 0.15804969370365143,
|
||
|
|
"num_tokens": 9112794.0,
|
||
|
|
"step": 4945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.837375354766846,
|
||
|
|
"epoch": 0.4158790170132325,
|
||
|
|
"grad_norm": 0.8359375,
|
||
|
|
"learning_rate": 0.0004987580178201492,
|
||
|
|
"loss": 5.7246,
|
||
|
|
"mean_token_accuracy": 0.16285934299230576,
|
||
|
|
"num_tokens": 9122718.0,
|
||
|
|
"step": 4950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.831628942489624,
|
||
|
|
"epoch": 0.4162990968283974,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004987548736799882,
|
||
|
|
"loss": 5.7454,
|
||
|
|
"mean_token_accuracy": 0.1529500514268875,
|
||
|
|
"num_tokens": 9131855.0,
|
||
|
|
"step": 4955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.798128986358643,
|
||
|
|
"epoch": 0.4167191766435623,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004987517255761248,
|
||
|
|
"loss": 5.6019,
|
||
|
|
"mean_token_accuracy": 0.1599896475672722,
|
||
|
|
"num_tokens": 9141102.0,
|
||
|
|
"step": 4960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.77801775932312,
|
||
|
|
"epoch": 0.4171392564587272,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004987485735086148,
|
||
|
|
"loss": 5.6601,
|
||
|
|
"mean_token_accuracy": 0.16009112149477006,
|
||
|
|
"num_tokens": 9150552.0,
|
||
|
|
"step": 4965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.852486228942871,
|
||
|
|
"epoch": 0.417559336273892,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.000498745417477514,
|
||
|
|
"loss": 5.657,
|
||
|
|
"mean_token_accuracy": 0.15402564853429795,
|
||
|
|
"num_tokens": 9160105.0,
|
||
|
|
"step": 4970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.779581785202026,
|
||
|
|
"epoch": 0.4179794160890569,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004987422574828784,
|
||
|
|
"loss": 5.6566,
|
||
|
|
"mean_token_accuracy": 0.15598243325948716,
|
||
|
|
"num_tokens": 9169367.0,
|
||
|
|
"step": 4975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.786018943786621,
|
||
|
|
"epoch": 0.4183994959042218,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.0004987390935247639,
|
||
|
|
"loss": 5.5264,
|
||
|
|
"mean_token_accuracy": 0.16368313133716583,
|
||
|
|
"num_tokens": 9177872.0,
|
||
|
|
"step": 4980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.82407512664795,
|
||
|
|
"epoch": 0.41881957571938666,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004987359256032265,
|
||
|
|
"loss": 5.7466,
|
||
|
|
"mean_token_accuracy": 0.151212839782238,
|
||
|
|
"num_tokens": 9187879.0,
|
||
|
|
"step": 4985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.807058525085449,
|
||
|
|
"epoch": 0.41923965553455156,
|
||
|
|
"grad_norm": 0.8671875,
|
||
|
|
"learning_rate": 0.0004987327537183225,
|
||
|
|
"loss": 5.6561,
|
||
|
|
"mean_token_accuracy": 0.15415959805250168,
|
||
|
|
"num_tokens": 9198281.0,
|
||
|
|
"step": 4990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.805870008468628,
|
||
|
|
"epoch": 0.41965973534971646,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004987295778701078,
|
||
|
|
"loss": 5.6394,
|
||
|
|
"mean_token_accuracy": 0.16050323396921157,
|
||
|
|
"num_tokens": 9207670.0,
|
||
|
|
"step": 4995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.877247047424317,
|
||
|
|
"epoch": 0.42007981516488135,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.000498726398058639,
|
||
|
|
"loss": 5.6482,
|
||
|
|
"mean_token_accuracy": 0.16082072257995605,
|
||
|
|
"num_tokens": 9216995.0,
|
||
|
|
"step": 5000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.812716388702393,
|
||
|
|
"epoch": 0.4204998949800462,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.0004987232142839723,
|
||
|
|
"loss": 5.7482,
|
||
|
|
"mean_token_accuracy": 0.1490781858563423,
|
||
|
|
"num_tokens": 9227330.0,
|
||
|
|
"step": 5005
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.844203805923462,
|
||
|
|
"epoch": 0.4209199747952111,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004987200265461638,
|
||
|
|
"loss": 5.656,
|
||
|
|
"mean_token_accuracy": 0.16385895162820815,
|
||
|
|
"num_tokens": 9236666.0,
|
||
|
|
"step": 5010
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.85231499671936,
|
||
|
|
"epoch": 0.421340054610376,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004987168348452705,
|
||
|
|
"loss": 5.6595,
|
||
|
|
"mean_token_accuracy": 0.16210315823554994,
|
||
|
|
"num_tokens": 9246388.0,
|
||
|
|
"step": 5015
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.789185667037964,
|
||
|
|
"epoch": 0.42176013442554083,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004987136391813485,
|
||
|
|
"loss": 5.6096,
|
||
|
|
"mean_token_accuracy": 0.16511590033769608,
|
||
|
|
"num_tokens": 9255239.0,
|
||
|
|
"step": 5020
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.742922639846801,
|
||
|
|
"epoch": 0.42218021424070573,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004987104395544547,
|
||
|
|
"loss": 5.5924,
|
||
|
|
"mean_token_accuracy": 0.15797384828329086,
|
||
|
|
"num_tokens": 9264468.0,
|
||
|
|
"step": 5025
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.819699382781982,
|
||
|
|
"epoch": 0.42260029405587063,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004987072359646455,
|
||
|
|
"loss": 5.6607,
|
||
|
|
"mean_token_accuracy": 0.16205601245164872,
|
||
|
|
"num_tokens": 9274140.0,
|
||
|
|
"step": 5030
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.83985595703125,
|
||
|
|
"epoch": 0.42302037387103547,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004987040284119778,
|
||
|
|
"loss": 5.6327,
|
||
|
|
"mean_token_accuracy": 0.1588321939110756,
|
||
|
|
"num_tokens": 9283539.0,
|
||
|
|
"step": 5035
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.751109886169433,
|
||
|
|
"epoch": 0.42344045368620037,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004987008168965087,
|
||
|
|
"loss": 5.6403,
|
||
|
|
"mean_token_accuracy": 0.1550469622015953,
|
||
|
|
"num_tokens": 9292664.0,
|
||
|
|
"step": 5040
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.876785469055176,
|
||
|
|
"epoch": 0.42386053350136527,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004986976014182946,
|
||
|
|
"loss": 5.7374,
|
||
|
|
"mean_token_accuracy": 0.1531568393111229,
|
||
|
|
"num_tokens": 9302814.0,
|
||
|
|
"step": 5045
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.890387535095215,
|
||
|
|
"epoch": 0.42428061331653016,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004986943819773927,
|
||
|
|
"loss": 5.7332,
|
||
|
|
"mean_token_accuracy": 0.15649186819791794,
|
||
|
|
"num_tokens": 9312654.0,
|
||
|
|
"step": 5050
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8707475662231445,
|
||
|
|
"epoch": 0.424700693131695,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.00049869115857386,
|
||
|
|
"loss": 5.7558,
|
||
|
|
"mean_token_accuracy": 0.14800945520401002,
|
||
|
|
"num_tokens": 9322271.0,
|
||
|
|
"step": 5055
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.878791618347168,
|
||
|
|
"epoch": 0.4251207729468599,
|
||
|
|
"grad_norm": 0.86328125,
|
||
|
|
"learning_rate": 0.0004986879312077536,
|
||
|
|
"loss": 5.688,
|
||
|
|
"mean_token_accuracy": 0.15585887283086777,
|
||
|
|
"num_tokens": 9331341.0,
|
||
|
|
"step": 5060
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.796487426757812,
|
||
|
|
"epoch": 0.4255408527620248,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004986846998791308,
|
||
|
|
"loss": 5.6274,
|
||
|
|
"mean_token_accuracy": 0.15625337660312652,
|
||
|
|
"num_tokens": 9339863.0,
|
||
|
|
"step": 5065
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.72486629486084,
|
||
|
|
"epoch": 0.42596093257718964,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004986814645880485,
|
||
|
|
"loss": 5.5974,
|
||
|
|
"mean_token_accuracy": 0.16185437515377998,
|
||
|
|
"num_tokens": 9349488.0,
|
||
|
|
"step": 5070
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7803843975067135,
|
||
|
|
"epoch": 0.42638101239235454,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004986782253345645,
|
||
|
|
"loss": 5.6105,
|
||
|
|
"mean_token_accuracy": 0.15332376062870026,
|
||
|
|
"num_tokens": 9357977.0,
|
||
|
|
"step": 5075
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.823932743072509,
|
||
|
|
"epoch": 0.42680109220751944,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004986749821187358,
|
||
|
|
"loss": 5.7156,
|
||
|
|
"mean_token_accuracy": 0.15630935728549958,
|
||
|
|
"num_tokens": 9367449.0,
|
||
|
|
"step": 5080
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.89394211769104,
|
||
|
|
"epoch": 0.42722117202268434,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.00049867173494062,
|
||
|
|
"loss": 5.7321,
|
||
|
|
"mean_token_accuracy": 0.15639646500349044,
|
||
|
|
"num_tokens": 9377070.0,
|
||
|
|
"step": 5085
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.765441846847534,
|
||
|
|
"epoch": 0.4276412518378492,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004986684838002744,
|
||
|
|
"loss": 5.5217,
|
||
|
|
"mean_token_accuracy": 0.15419476479291916,
|
||
|
|
"num_tokens": 9385881.0,
|
||
|
|
"step": 5090
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.770947122573853,
|
||
|
|
"epoch": 0.4280613316530141,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004986652286977569,
|
||
|
|
"loss": 5.6523,
|
||
|
|
"mean_token_accuracy": 0.15255010426044463,
|
||
|
|
"num_tokens": 9395159.0,
|
||
|
|
"step": 5095
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.805099630355835,
|
||
|
|
"epoch": 0.428481411468179,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004986619696331252,
|
||
|
|
"loss": 5.6045,
|
||
|
|
"mean_token_accuracy": 0.1583484500646591,
|
||
|
|
"num_tokens": 9404590.0,
|
||
|
|
"step": 5100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.841793823242187,
|
||
|
|
"epoch": 0.4289014912833438,
|
||
|
|
"grad_norm": 0.8515625,
|
||
|
|
"learning_rate": 0.0004986587066064367,
|
||
|
|
"loss": 5.6238,
|
||
|
|
"mean_token_accuracy": 0.1618543565273285,
|
||
|
|
"num_tokens": 9414452.0,
|
||
|
|
"step": 5105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.882272624969483,
|
||
|
|
"epoch": 0.4293215710985087,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004986554396177494,
|
||
|
|
"loss": 5.7691,
|
||
|
|
"mean_token_accuracy": 0.1512654058635235,
|
||
|
|
"num_tokens": 9424004.0,
|
||
|
|
"step": 5110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.826911163330078,
|
||
|
|
"epoch": 0.4297416509136736,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004986521686671212,
|
||
|
|
"loss": 5.6377,
|
||
|
|
"mean_token_accuracy": 0.16602189987897872,
|
||
|
|
"num_tokens": 9433487.0,
|
||
|
|
"step": 5115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.761785840988159,
|
||
|
|
"epoch": 0.43016173072883845,
|
||
|
|
"grad_norm": 1.046875,
|
||
|
|
"learning_rate": 0.00049864889375461,
|
||
|
|
"loss": 5.701,
|
||
|
|
"mean_token_accuracy": 0.15255770534276963,
|
||
|
|
"num_tokens": 9442742.0,
|
||
|
|
"step": 5120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.816967296600342,
|
||
|
|
"epoch": 0.43058181054400335,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004986456148802738,
|
||
|
|
"loss": 5.7673,
|
||
|
|
"mean_token_accuracy": 0.15205237418413162,
|
||
|
|
"num_tokens": 9452550.0,
|
||
|
|
"step": 5125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.930779886245728,
|
||
|
|
"epoch": 0.43100189035916825,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004986423320441707,
|
||
|
|
"loss": 5.7143,
|
||
|
|
"mean_token_accuracy": 0.14957663267850876,
|
||
|
|
"num_tokens": 9461920.0,
|
||
|
|
"step": 5130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.818691873550415,
|
||
|
|
"epoch": 0.43142197017433315,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004986390452463588,
|
||
|
|
"loss": 5.6211,
|
||
|
|
"mean_token_accuracy": 0.15580169409513472,
|
||
|
|
"num_tokens": 9470817.0,
|
||
|
|
"step": 5135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.700370407104492,
|
||
|
|
"epoch": 0.431842049989498,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004986357544868964,
|
||
|
|
"loss": 5.5801,
|
||
|
|
"mean_token_accuracy": 0.1596447467803955,
|
||
|
|
"num_tokens": 9479936.0,
|
||
|
|
"step": 5140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.841777086257935,
|
||
|
|
"epoch": 0.4322621298046629,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004986324597658418,
|
||
|
|
"loss": 5.6155,
|
||
|
|
"mean_token_accuracy": 0.16243926435709,
|
||
|
|
"num_tokens": 9489818.0,
|
||
|
|
"step": 5145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.728731489181518,
|
||
|
|
"epoch": 0.4326822096198278,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004986291610832533,
|
||
|
|
"loss": 5.624,
|
||
|
|
"mean_token_accuracy": 0.153781495988369,
|
||
|
|
"num_tokens": 9499688.0,
|
||
|
|
"step": 5150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918451547622681,
|
||
|
|
"epoch": 0.4331022894349926,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004986258584391892,
|
||
|
|
"loss": 5.6774,
|
||
|
|
"mean_token_accuracy": 0.15540721267461777,
|
||
|
|
"num_tokens": 9509581.0,
|
||
|
|
"step": 5155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.923600053787231,
|
||
|
|
"epoch": 0.4335223692501575,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004986225518337084,
|
||
|
|
"loss": 5.7525,
|
||
|
|
"mean_token_accuracy": 0.15666318088769912,
|
||
|
|
"num_tokens": 9518556.0,
|
||
|
|
"step": 5160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.714486789703369,
|
||
|
|
"epoch": 0.4339424490653224,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004986192412668692,
|
||
|
|
"loss": 5.6587,
|
||
|
|
"mean_token_accuracy": 0.1547637924551964,
|
||
|
|
"num_tokens": 9527612.0,
|
||
|
|
"step": 5165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.787137269973755,
|
||
|
|
"epoch": 0.4343625288804873,
|
||
|
|
"grad_norm": 1.03125,
|
||
|
|
"learning_rate": 0.0004986159267387302,
|
||
|
|
"loss": 5.5546,
|
||
|
|
"mean_token_accuracy": 0.16138194501399994,
|
||
|
|
"num_tokens": 9535882.0,
|
||
|
|
"step": 5170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.797946739196777,
|
||
|
|
"epoch": 0.43478260869565216,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004986126082493502,
|
||
|
|
"loss": 5.656,
|
||
|
|
"mean_token_accuracy": 0.1613065406680107,
|
||
|
|
"num_tokens": 9544799.0,
|
||
|
|
"step": 5175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.779606723785401,
|
||
|
|
"epoch": 0.43520268851081706,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004986092857987881,
|
||
|
|
"loss": 5.5729,
|
||
|
|
"mean_token_accuracy": 0.1618928477168083,
|
||
|
|
"num_tokens": 9553805.0,
|
||
|
|
"step": 5180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.782668399810791,
|
||
|
|
"epoch": 0.43562276832598196,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004986059593871026,
|
||
|
|
"loss": 5.5971,
|
||
|
|
"mean_token_accuracy": 0.1598972573876381,
|
||
|
|
"num_tokens": 9563493.0,
|
||
|
|
"step": 5185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.800241613388062,
|
||
|
|
"epoch": 0.4360428481411468,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004986026290143527,
|
||
|
|
"loss": 5.6842,
|
||
|
|
"mean_token_accuracy": 0.15388598516583443,
|
||
|
|
"num_tokens": 9572297.0,
|
||
|
|
"step": 5190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.936120653152466,
|
||
|
|
"epoch": 0.4364629279563117,
|
||
|
|
"grad_norm": 1.0390625,
|
||
|
|
"learning_rate": 0.0004985992946805973,
|
||
|
|
"loss": 5.8134,
|
||
|
|
"mean_token_accuracy": 0.15065453350543975,
|
||
|
|
"num_tokens": 9581967.0,
|
||
|
|
"step": 5195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.819184160232544,
|
||
|
|
"epoch": 0.4368830077714766,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004985959563858955,
|
||
|
|
"loss": 5.7273,
|
||
|
|
"mean_token_accuracy": 0.16100031584501268,
|
||
|
|
"num_tokens": 9590885.0,
|
||
|
|
"step": 5200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.860151624679565,
|
||
|
|
"epoch": 0.43730308758664144,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004985926141303066,
|
||
|
|
"loss": 5.6532,
|
||
|
|
"mean_token_accuracy": 0.1567025899887085,
|
||
|
|
"num_tokens": 9599247.0,
|
||
|
|
"step": 5205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.818394136428833,
|
||
|
|
"epoch": 0.43772316740180633,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004985892679138896,
|
||
|
|
"loss": 5.571,
|
||
|
|
"mean_token_accuracy": 0.16371893361210824,
|
||
|
|
"num_tokens": 9608296.0,
|
||
|
|
"step": 5210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8166498184204105,
|
||
|
|
"epoch": 0.43814324721697123,
|
||
|
|
"grad_norm": 1.1015625,
|
||
|
|
"learning_rate": 0.0004985859177367038,
|
||
|
|
"loss": 5.6242,
|
||
|
|
"mean_token_accuracy": 0.15776645839214326,
|
||
|
|
"num_tokens": 9616734.0,
|
||
|
|
"step": 5215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.83067135810852,
|
||
|
|
"epoch": 0.43856332703213613,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004985825635988087,
|
||
|
|
"loss": 5.699,
|
||
|
|
"mean_token_accuracy": 0.1571464478969574,
|
||
|
|
"num_tokens": 9626246.0,
|
||
|
|
"step": 5220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7702131271362305,
|
||
|
|
"epoch": 0.43898340684730097,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004985792055002635,
|
||
|
|
"loss": 5.5794,
|
||
|
|
"mean_token_accuracy": 0.16028426140546798,
|
||
|
|
"num_tokens": 9634963.0,
|
||
|
|
"step": 5225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8400349617004395,
|
||
|
|
"epoch": 0.43940348666246587,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004985758434411278,
|
||
|
|
"loss": 5.6513,
|
||
|
|
"mean_token_accuracy": 0.16422291100025177,
|
||
|
|
"num_tokens": 9643615.0,
|
||
|
|
"step": 5230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.810837030410767,
|
||
|
|
"epoch": 0.43982356647763077,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004985724774214613,
|
||
|
|
"loss": 5.6244,
|
||
|
|
"mean_token_accuracy": 0.15992441177368164,
|
||
|
|
"num_tokens": 9653306.0,
|
||
|
|
"step": 5235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.767703294754028,
|
||
|
|
"epoch": 0.4402436462927956,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004985691074413233,
|
||
|
|
"loss": 5.6505,
|
||
|
|
"mean_token_accuracy": 0.15613847076892853,
|
||
|
|
"num_tokens": 9662389.0,
|
||
|
|
"step": 5240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.753371381759644,
|
||
|
|
"epoch": 0.4406637261079605,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004985657335007739,
|
||
|
|
"loss": 5.6446,
|
||
|
|
"mean_token_accuracy": 0.15534982979297637,
|
||
|
|
"num_tokens": 9671183.0,
|
||
|
|
"step": 5245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.836323595046997,
|
||
|
|
"epoch": 0.4410838059231254,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004985623555998725,
|
||
|
|
"loss": 5.6222,
|
||
|
|
"mean_token_accuracy": 0.16474147886037827,
|
||
|
|
"num_tokens": 9680544.0,
|
||
|
|
"step": 5250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.819104290008545,
|
||
|
|
"epoch": 0.4415038857382903,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004985589737386791,
|
||
|
|
"loss": 5.6779,
|
||
|
|
"mean_token_accuracy": 0.15779446437954903,
|
||
|
|
"num_tokens": 9690137.0,
|
||
|
|
"step": 5255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.74895076751709,
|
||
|
|
"epoch": 0.44192396555345514,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004985555879172535,
|
||
|
|
"loss": 5.6131,
|
||
|
|
"mean_token_accuracy": 0.16228249818086624,
|
||
|
|
"num_tokens": 9699149.0,
|
||
|
|
"step": 5260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830872917175293,
|
||
|
|
"epoch": 0.44234404536862004,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.000498552198135656,
|
||
|
|
"loss": 5.6857,
|
||
|
|
"mean_token_accuracy": 0.16091985404491424,
|
||
|
|
"num_tokens": 9709308.0,
|
||
|
|
"step": 5265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.817913627624511,
|
||
|
|
"epoch": 0.44276412518378494,
|
||
|
|
"grad_norm": 1.15625,
|
||
|
|
"learning_rate": 0.0004985488043939462,
|
||
|
|
"loss": 5.6133,
|
||
|
|
"mean_token_accuracy": 0.15377137959003448,
|
||
|
|
"num_tokens": 9718462.0,
|
||
|
|
"step": 5270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.762473201751709,
|
||
|
|
"epoch": 0.4431842049989498,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004985454066921846,
|
||
|
|
"loss": 5.5442,
|
||
|
|
"mean_token_accuracy": 0.16455349177122117,
|
||
|
|
"num_tokens": 9727626.0,
|
||
|
|
"step": 5275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.663512516021728,
|
||
|
|
"epoch": 0.4436042848141147,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004985420050304312,
|
||
|
|
"loss": 5.5827,
|
||
|
|
"mean_token_accuracy": 0.15936666429042817,
|
||
|
|
"num_tokens": 9737091.0,
|
||
|
|
"step": 5280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.770118761062622,
|
||
|
|
"epoch": 0.4440243646292796,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004985385994087462,
|
||
|
|
"loss": 5.6417,
|
||
|
|
"mean_token_accuracy": 0.1584844209253788,
|
||
|
|
"num_tokens": 9746135.0,
|
||
|
|
"step": 5285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.844138050079346,
|
||
|
|
"epoch": 0.4444444444444444,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004985351898271901,
|
||
|
|
"loss": 5.5853,
|
||
|
|
"mean_token_accuracy": 0.1622116059064865,
|
||
|
|
"num_tokens": 9754549.0,
|
||
|
|
"step": 5290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.83607120513916,
|
||
|
|
"epoch": 0.4448645242596093,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004985317762858231,
|
||
|
|
"loss": 5.7065,
|
||
|
|
"mean_token_accuracy": 0.1499613419175148,
|
||
|
|
"num_tokens": 9764219.0,
|
||
|
|
"step": 5295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.792026853561401,
|
||
|
|
"epoch": 0.4452846040747742,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.000498528358784706,
|
||
|
|
"loss": 5.5519,
|
||
|
|
"mean_token_accuracy": 0.1638228639960289,
|
||
|
|
"num_tokens": 9772234.0,
|
||
|
|
"step": 5300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.749575090408325,
|
||
|
|
"epoch": 0.4457046838899391,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.000498524937323899,
|
||
|
|
"loss": 5.6106,
|
||
|
|
"mean_token_accuracy": 0.16515014916658402,
|
||
|
|
"num_tokens": 9781417.0,
|
||
|
|
"step": 5305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.9356084823608395,
|
||
|
|
"epoch": 0.44612476370510395,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004985215119034628,
|
||
|
|
"loss": 5.7505,
|
||
|
|
"mean_token_accuracy": 0.14851112440228462,
|
||
|
|
"num_tokens": 9791286.0,
|
||
|
|
"step": 5310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8016856670379635,
|
||
|
|
"epoch": 0.44654484352026885,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004985180825234582,
|
||
|
|
"loss": 5.7329,
|
||
|
|
"mean_token_accuracy": 0.15573213249444962,
|
||
|
|
"num_tokens": 9802157.0,
|
||
|
|
"step": 5315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.89680552482605,
|
||
|
|
"epoch": 0.44696492333543375,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004985146491839459,
|
||
|
|
"loss": 5.7173,
|
||
|
|
"mean_token_accuracy": 0.1475129798054695,
|
||
|
|
"num_tokens": 9812646.0,
|
||
|
|
"step": 5320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.870607805252075,
|
||
|
|
"epoch": 0.4473850031505986,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004985112118849865,
|
||
|
|
"loss": 5.7088,
|
||
|
|
"mean_token_accuracy": 0.15120236873626708,
|
||
|
|
"num_tokens": 9822274.0,
|
||
|
|
"step": 5325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.753091526031494,
|
||
|
|
"epoch": 0.4478050829657635,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004985077706266412,
|
||
|
|
"loss": 5.5294,
|
||
|
|
"mean_token_accuracy": 0.15791643261909485,
|
||
|
|
"num_tokens": 9831337.0,
|
||
|
|
"step": 5330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.79245548248291,
|
||
|
|
"epoch": 0.4482251627809284,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004985043254089708,
|
||
|
|
"loss": 5.6629,
|
||
|
|
"mean_token_accuracy": 0.15153390020132065,
|
||
|
|
"num_tokens": 9840798.0,
|
||
|
|
"step": 5335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.723747682571411,
|
||
|
|
"epoch": 0.44864524259609323,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004985008762320364,
|
||
|
|
"loss": 5.637,
|
||
|
|
"mean_token_accuracy": 0.15859152227640153,
|
||
|
|
"num_tokens": 9850117.0,
|
||
|
|
"step": 5340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.79846601486206,
|
||
|
|
"epoch": 0.4490653224112581,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.000498497423095899,
|
||
|
|
"loss": 5.5724,
|
||
|
|
"mean_token_accuracy": 0.16569938510656357,
|
||
|
|
"num_tokens": 9858227.0,
|
||
|
|
"step": 5345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.755469799041748,
|
||
|
|
"epoch": 0.449485402226423,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004984939660006199,
|
||
|
|
"loss": 5.6759,
|
||
|
|
"mean_token_accuracy": 0.15846239179372787,
|
||
|
|
"num_tokens": 9867157.0,
|
||
|
|
"step": 5350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7474853515625,
|
||
|
|
"epoch": 0.4499054820415879,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004984905049462602,
|
||
|
|
"loss": 5.5876,
|
||
|
|
"mean_token_accuracy": 0.15728517472743989,
|
||
|
|
"num_tokens": 9877045.0,
|
||
|
|
"step": 5355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918812370300293,
|
||
|
|
"epoch": 0.45032556185675277,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004984870399328814,
|
||
|
|
"loss": 5.7228,
|
||
|
|
"mean_token_accuracy": 0.15240922719240188,
|
||
|
|
"num_tokens": 9886637.0,
|
||
|
|
"step": 5360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.742618703842163,
|
||
|
|
"epoch": 0.45074564167191766,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004984835709605446,
|
||
|
|
"loss": 5.5883,
|
||
|
|
"mean_token_accuracy": 0.16404919177293778,
|
||
|
|
"num_tokens": 9895601.0,
|
||
|
|
"step": 5365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8194098472595215,
|
||
|
|
"epoch": 0.45116572148708256,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004984800980293116,
|
||
|
|
"loss": 5.738,
|
||
|
|
"mean_token_accuracy": 0.1579892724752426,
|
||
|
|
"num_tokens": 9904775.0,
|
||
|
|
"step": 5370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.780790996551514,
|
||
|
|
"epoch": 0.4515858013022474,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004984766211392435,
|
||
|
|
"loss": 5.6783,
|
||
|
|
"mean_token_accuracy": 0.15692917853593827,
|
||
|
|
"num_tokens": 9913795.0,
|
||
|
|
"step": 5375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.802691316604614,
|
||
|
|
"epoch": 0.4520058811174123,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004984731402904024,
|
||
|
|
"loss": 5.5113,
|
||
|
|
"mean_token_accuracy": 0.16487460136413573,
|
||
|
|
"num_tokens": 9922576.0,
|
||
|
|
"step": 5380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.772703742980957,
|
||
|
|
"epoch": 0.4524259609325772,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004984696554828496,
|
||
|
|
"loss": 5.4922,
|
||
|
|
"mean_token_accuracy": 0.1670244887471199,
|
||
|
|
"num_tokens": 9930971.0,
|
||
|
|
"step": 5385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.794325065612793,
|
||
|
|
"epoch": 0.4528460407477421,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004984661667166468,
|
||
|
|
"loss": 5.6128,
|
||
|
|
"mean_token_accuracy": 0.16192587018013,
|
||
|
|
"num_tokens": 9939628.0,
|
||
|
|
"step": 5390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7850220680236815,
|
||
|
|
"epoch": 0.45326612056290694,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004984626739918561,
|
||
|
|
"loss": 5.5903,
|
||
|
|
"mean_token_accuracy": 0.16074153482913972,
|
||
|
|
"num_tokens": 9948397.0,
|
||
|
|
"step": 5395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814194774627685,
|
||
|
|
"epoch": 0.45368620037807184,
|
||
|
|
"grad_norm": 0.87890625,
|
||
|
|
"learning_rate": 0.0004984591773085391,
|
||
|
|
"loss": 5.67,
|
||
|
|
"mean_token_accuracy": 0.15753872096538543,
|
||
|
|
"num_tokens": 9957683.0,
|
||
|
|
"step": 5400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814547824859619,
|
||
|
|
"epoch": 0.45410628019323673,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004984556766667578,
|
||
|
|
"loss": 5.6587,
|
||
|
|
"mean_token_accuracy": 0.1586209386587143,
|
||
|
|
"num_tokens": 9966756.0,
|
||
|
|
"step": 5405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.744683790206909,
|
||
|
|
"epoch": 0.4545263600084016,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004984521720665743,
|
||
|
|
"loss": 5.6532,
|
||
|
|
"mean_token_accuracy": 0.16073551923036575,
|
||
|
|
"num_tokens": 9976000.0,
|
||
|
|
"step": 5410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.857652235031128,
|
||
|
|
"epoch": 0.4549464398235665,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004984486635080507,
|
||
|
|
"loss": 5.6506,
|
||
|
|
"mean_token_accuracy": 0.15694389641284942,
|
||
|
|
"num_tokens": 9985509.0,
|
||
|
|
"step": 5415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7711996078491214,
|
||
|
|
"epoch": 0.45536651963873137,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004984451509912489,
|
||
|
|
"loss": 5.5899,
|
||
|
|
"mean_token_accuracy": 0.1618253692984581,
|
||
|
|
"num_tokens": 9994342.0,
|
||
|
|
"step": 5420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.746224308013916,
|
||
|
|
"epoch": 0.4557865994538962,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004984416345162315,
|
||
|
|
"loss": 5.6478,
|
||
|
|
"mean_token_accuracy": 0.15566404908895493,
|
||
|
|
"num_tokens": 10004249.0,
|
||
|
|
"step": 5425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.76487717628479,
|
||
|
|
"epoch": 0.4562066792690611,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004984381140830605,
|
||
|
|
"loss": 5.6061,
|
||
|
|
"mean_token_accuracy": 0.16023263484239578,
|
||
|
|
"num_tokens": 10012430.0,
|
||
|
|
"step": 5430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.82148494720459,
|
||
|
|
"epoch": 0.456626759084226,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004984345896917984,
|
||
|
|
"loss": 5.615,
|
||
|
|
"mean_token_accuracy": 0.15671578347682952,
|
||
|
|
"num_tokens": 10021434.0,
|
||
|
|
"step": 5435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7957844734191895,
|
||
|
|
"epoch": 0.4570468388993909,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004984310613425076,
|
||
|
|
"loss": 5.6077,
|
||
|
|
"mean_token_accuracy": 0.16273672878742218,
|
||
|
|
"num_tokens": 10030473.0,
|
||
|
|
"step": 5440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7984706401824955,
|
||
|
|
"epoch": 0.45746691871455575,
|
||
|
|
"grad_norm": 1.109375,
|
||
|
|
"learning_rate": 0.0004984275290352506,
|
||
|
|
"loss": 5.6027,
|
||
|
|
"mean_token_accuracy": 0.16592728793621064,
|
||
|
|
"num_tokens": 10039057.0,
|
||
|
|
"step": 5445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.82614917755127,
|
||
|
|
"epoch": 0.45788699852972065,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004984239927700899,
|
||
|
|
"loss": 5.6993,
|
||
|
|
"mean_token_accuracy": 0.15564172416925431,
|
||
|
|
"num_tokens": 10047998.0,
|
||
|
|
"step": 5450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.890322923660278,
|
||
|
|
"epoch": 0.45830707834488554,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004984204525470883,
|
||
|
|
"loss": 5.6293,
|
||
|
|
"mean_token_accuracy": 0.1547103099524975,
|
||
|
|
"num_tokens": 10057479.0,
|
||
|
|
"step": 5455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.735934209823609,
|
||
|
|
"epoch": 0.4587271581600504,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004984169083663084,
|
||
|
|
"loss": 5.6068,
|
||
|
|
"mean_token_accuracy": 0.1534338653087616,
|
||
|
|
"num_tokens": 10067754.0,
|
||
|
|
"step": 5460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.795390987396241,
|
||
|
|
"epoch": 0.4591472379752153,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004984133602278129,
|
||
|
|
"loss": 5.6835,
|
||
|
|
"mean_token_accuracy": 0.157898972928524,
|
||
|
|
"num_tokens": 10076815.0,
|
||
|
|
"step": 5465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.918915462493897,
|
||
|
|
"epoch": 0.4595673177903802,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.000498409808131665,
|
||
|
|
"loss": 5.6866,
|
||
|
|
"mean_token_accuracy": 0.15232098400592803,
|
||
|
|
"num_tokens": 10086300.0,
|
||
|
|
"step": 5470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7501527786254885,
|
||
|
|
"epoch": 0.4599873976055451,
|
||
|
|
"grad_norm": 0.8828125,
|
||
|
|
"learning_rate": 0.0004984062520779272,
|
||
|
|
"loss": 5.5857,
|
||
|
|
"mean_token_accuracy": 0.16250389367341994,
|
||
|
|
"num_tokens": 10095383.0,
|
||
|
|
"step": 5475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.6954700469970705,
|
||
|
|
"epoch": 0.4604074774207099,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.0004984026920666628,
|
||
|
|
"loss": 5.5697,
|
||
|
|
"mean_token_accuracy": 0.15912551581859588,
|
||
|
|
"num_tokens": 10103971.0,
|
||
|
|
"step": 5480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814951801300049,
|
||
|
|
"epoch": 0.4608275572358748,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004983991280979347,
|
||
|
|
"loss": 5.5799,
|
||
|
|
"mean_token_accuracy": 0.16145333349704744,
|
||
|
|
"num_tokens": 10113028.0,
|
||
|
|
"step": 5485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.79097695350647,
|
||
|
|
"epoch": 0.4612476370510397,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004983955601718061,
|
||
|
|
"loss": 5.5408,
|
||
|
|
"mean_token_accuracy": 0.16365961581468583,
|
||
|
|
"num_tokens": 10121890.0,
|
||
|
|
"step": 5490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.804393863677978,
|
||
|
|
"epoch": 0.46166771686620456,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004983919882883401,
|
||
|
|
"loss": 5.6663,
|
||
|
|
"mean_token_accuracy": 0.1603729695081711,
|
||
|
|
"num_tokens": 10131655.0,
|
||
|
|
"step": 5495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.873544406890869,
|
||
|
|
"epoch": 0.46208779668136946,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004983884124476,
|
||
|
|
"loss": 5.6699,
|
||
|
|
"mean_token_accuracy": 0.15749045610427856,
|
||
|
|
"num_tokens": 10140778.0,
|
||
|
|
"step": 5500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.814252138137817,
|
||
|
|
"epoch": 0.46250787649653435,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004983848326496494,
|
||
|
|
"loss": 5.7045,
|
||
|
|
"mean_token_accuracy": 0.15820754915475846,
|
||
|
|
"num_tokens": 10150229.0,
|
||
|
|
"step": 5505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.815248012542725,
|
||
|
|
"epoch": 0.4629279563116992,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004983812488945513,
|
||
|
|
"loss": 5.6102,
|
||
|
|
"mean_token_accuracy": 0.15927310138940812,
|
||
|
|
"num_tokens": 10158939.0,
|
||
|
|
"step": 5510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.772242593765259,
|
||
|
|
"epoch": 0.4633480361268641,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004983776611823696,
|
||
|
|
"loss": 5.6172,
|
||
|
|
"mean_token_accuracy": 0.15591025203466416,
|
||
|
|
"num_tokens": 10168383.0,
|
||
|
|
"step": 5515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.762513542175293,
|
||
|
|
"epoch": 0.463768115942029,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004983740695131676,
|
||
|
|
"loss": 5.614,
|
||
|
|
"mean_token_accuracy": 0.16522103548049927,
|
||
|
|
"num_tokens": 10178678.0,
|
||
|
|
"step": 5520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.78189172744751,
|
||
|
|
"epoch": 0.4641881957571939,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.000498370473887009,
|
||
|
|
"loss": 5.5993,
|
||
|
|
"mean_token_accuracy": 0.1618872195482254,
|
||
|
|
"num_tokens": 10188964.0,
|
||
|
|
"step": 5525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.797432947158813,
|
||
|
|
"epoch": 0.46460827557235873,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004983668743039573,
|
||
|
|
"loss": 5.626,
|
||
|
|
"mean_token_accuracy": 0.16132238358259202,
|
||
|
|
"num_tokens": 10198333.0,
|
||
|
|
"step": 5530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7464605331420895,
|
||
|
|
"epoch": 0.46502835538752363,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004983632707640766,
|
||
|
|
"loss": 5.6385,
|
||
|
|
"mean_token_accuracy": 0.15782831460237504,
|
||
|
|
"num_tokens": 10207876.0,
|
||
|
|
"step": 5535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7676252841949465,
|
||
|
|
"epoch": 0.4654484352026885,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004983596632674306,
|
||
|
|
"loss": 5.5836,
|
||
|
|
"mean_token_accuracy": 0.15963911265134811,
|
||
|
|
"num_tokens": 10216822.0,
|
||
|
|
"step": 5540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.864213514328003,
|
||
|
|
"epoch": 0.46586851501785337,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004983560518140831,
|
||
|
|
"loss": 5.6988,
|
||
|
|
"mean_token_accuracy": 0.15088534951210023,
|
||
|
|
"num_tokens": 10226887.0,
|
||
|
|
"step": 5545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.807913875579834,
|
||
|
|
"epoch": 0.46628859483301827,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004983524364040982,
|
||
|
|
"loss": 5.5379,
|
||
|
|
"mean_token_accuracy": 0.16848834306001664,
|
||
|
|
"num_tokens": 10235935.0,
|
||
|
|
"step": 5550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.751170539855957,
|
||
|
|
"epoch": 0.46670867464818316,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004983488170375399,
|
||
|
|
"loss": 5.5025,
|
||
|
|
"mean_token_accuracy": 0.16097538769245148,
|
||
|
|
"num_tokens": 10245590.0,
|
||
|
|
"step": 5555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.752688026428222,
|
||
|
|
"epoch": 0.46712875446334806,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004983451937144723,
|
||
|
|
"loss": 5.5925,
|
||
|
|
"mean_token_accuracy": 0.15908439457416534,
|
||
|
|
"num_tokens": 10255104.0,
|
||
|
|
"step": 5560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.625225067138672,
|
||
|
|
"epoch": 0.4675488342785129,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004983415664349595,
|
||
|
|
"loss": 5.4479,
|
||
|
|
"mean_token_accuracy": 0.16906733959913253,
|
||
|
|
"num_tokens": 10264236.0,
|
||
|
|
"step": 5565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.791613006591797,
|
||
|
|
"epoch": 0.4679689140936778,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004983379351990659,
|
||
|
|
"loss": 5.5634,
|
||
|
|
"mean_token_accuracy": 0.16491406708955764,
|
||
|
|
"num_tokens": 10273335.0,
|
||
|
|
"step": 5570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.73756160736084,
|
||
|
|
"epoch": 0.4683889939088427,
|
||
|
|
"grad_norm": 0.83203125,
|
||
|
|
"learning_rate": 0.0004983343000068559,
|
||
|
|
"loss": 5.5392,
|
||
|
|
"mean_token_accuracy": 0.16353048831224443,
|
||
|
|
"num_tokens": 10282206.0,
|
||
|
|
"step": 5575
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.679240655899048,
|
||
|
|
"epoch": 0.46880907372400754,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004983306608583937,
|
||
|
|
"loss": 5.4798,
|
||
|
|
"mean_token_accuracy": 0.17844018042087556,
|
||
|
|
"num_tokens": 10290056.0,
|
||
|
|
"step": 5580
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.697105741500854,
|
||
|
|
"epoch": 0.46922915353917244,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004983270177537438,
|
||
|
|
"loss": 5.5596,
|
||
|
|
"mean_token_accuracy": 0.16428319364786148,
|
||
|
|
"num_tokens": 10299726.0,
|
||
|
|
"step": 5585
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.741534852981568,
|
||
|
|
"epoch": 0.46964923335433734,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004983233706929708,
|
||
|
|
"loss": 5.6128,
|
||
|
|
"mean_token_accuracy": 0.1574200913310051,
|
||
|
|
"num_tokens": 10308696.0,
|
||
|
|
"step": 5590
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.87669529914856,
|
||
|
|
"epoch": 0.4700693131695022,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004983197196761392,
|
||
|
|
"loss": 5.706,
|
||
|
|
"mean_token_accuracy": 0.1552853010594845,
|
||
|
|
"num_tokens": 10317845.0,
|
||
|
|
"step": 5595
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.774369955062866,
|
||
|
|
"epoch": 0.4704893929846671,
|
||
|
|
"grad_norm": 1.0546875,
|
||
|
|
"learning_rate": 0.0004983160647033139,
|
||
|
|
"loss": 5.5975,
|
||
|
|
"mean_token_accuracy": 0.16107087433338166,
|
||
|
|
"num_tokens": 10326563.0,
|
||
|
|
"step": 5600
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.75340576171875,
|
||
|
|
"epoch": 0.470909472799832,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004983124057745595,
|
||
|
|
"loss": 5.5791,
|
||
|
|
"mean_token_accuracy": 0.15735821723937987,
|
||
|
|
"num_tokens": 10335931.0,
|
||
|
|
"step": 5605
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.707799339294434,
|
||
|
|
"epoch": 0.47132955261499687,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004983087428899408,
|
||
|
|
"loss": 5.5773,
|
||
|
|
"mean_token_accuracy": 0.15221105068922042,
|
||
|
|
"num_tokens": 10344984.0,
|
||
|
|
"step": 5610
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7647332668304445,
|
||
|
|
"epoch": 0.4717496324301617,
|
||
|
|
"grad_norm": 1.09375,
|
||
|
|
"learning_rate": 0.0004983050760495227,
|
||
|
|
"loss": 5.5966,
|
||
|
|
"mean_token_accuracy": 0.1603370040655136,
|
||
|
|
"num_tokens": 10353522.0,
|
||
|
|
"step": 5615
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7834312438964846,
|
||
|
|
"epoch": 0.4721697122453266,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004983014052533702,
|
||
|
|
"loss": 5.6121,
|
||
|
|
"mean_token_accuracy": 0.15812979638576508,
|
||
|
|
"num_tokens": 10363527.0,
|
||
|
|
"step": 5620
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.723613166809082,
|
||
|
|
"epoch": 0.4725897920604915,
|
||
|
|
"grad_norm": 0.88671875,
|
||
|
|
"learning_rate": 0.0004982977305015481,
|
||
|
|
"loss": 5.5439,
|
||
|
|
"mean_token_accuracy": 0.15958572328090667,
|
||
|
|
"num_tokens": 10372040.0,
|
||
|
|
"step": 5625
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.772522401809693,
|
||
|
|
"epoch": 0.47300987187565635,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004982940517941219,
|
||
|
|
"loss": 5.5227,
|
||
|
|
"mean_token_accuracy": 0.16043394133448602,
|
||
|
|
"num_tokens": 10381279.0,
|
||
|
|
"step": 5630
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.790616703033447,
|
||
|
|
"epoch": 0.47342995169082125,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004982903691311564,
|
||
|
|
"loss": 5.6984,
|
||
|
|
"mean_token_accuracy": 0.15549325048923493,
|
||
|
|
"num_tokens": 10390608.0,
|
||
|
|
"step": 5635
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.768335485458374,
|
||
|
|
"epoch": 0.47385003150598615,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004982866825127172,
|
||
|
|
"loss": 5.4862,
|
||
|
|
"mean_token_accuracy": 0.16711296737194062,
|
||
|
|
"num_tokens": 10399851.0,
|
||
|
|
"step": 5640
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.826428365707398,
|
||
|
|
"epoch": 0.47427011132115104,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004982829919388692,
|
||
|
|
"loss": 5.7573,
|
||
|
|
"mean_token_accuracy": 0.15294661596417428,
|
||
|
|
"num_tokens": 10410425.0,
|
||
|
|
"step": 5645
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.705338096618652,
|
||
|
|
"epoch": 0.4746901911363159,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004982792974096781,
|
||
|
|
"loss": 5.5446,
|
||
|
|
"mean_token_accuracy": 0.16691708862781524,
|
||
|
|
"num_tokens": 10418783.0,
|
||
|
|
"step": 5650
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.836835145950317,
|
||
|
|
"epoch": 0.4751102709514808,
|
||
|
|
"grad_norm": 1.1796875,
|
||
|
|
"learning_rate": 0.000498275598925209,
|
||
|
|
"loss": 5.7114,
|
||
|
|
"mean_token_accuracy": 0.15507804453372956,
|
||
|
|
"num_tokens": 10427360.0,
|
||
|
|
"step": 5655
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.856819105148316,
|
||
|
|
"epoch": 0.4755303507666457,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004982718964855277,
|
||
|
|
"loss": 5.6653,
|
||
|
|
"mean_token_accuracy": 0.1575305789709091,
|
||
|
|
"num_tokens": 10436613.0,
|
||
|
|
"step": 5660
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.742249536514282,
|
||
|
|
"epoch": 0.4759504305818105,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004982681900907,
|
||
|
|
"loss": 5.7114,
|
||
|
|
"mean_token_accuracy": 0.15877616107463838,
|
||
|
|
"num_tokens": 10445055.0,
|
||
|
|
"step": 5665
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.744962549209594,
|
||
|
|
"epoch": 0.4763705103969754,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.000498264479740791,
|
||
|
|
"loss": 5.5379,
|
||
|
|
"mean_token_accuracy": 0.16900296211242677,
|
||
|
|
"num_tokens": 10454516.0,
|
||
|
|
"step": 5670
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.830320215225219,
|
||
|
|
"epoch": 0.4767905902121403,
|
||
|
|
"grad_norm": 0.8984375,
|
||
|
|
"learning_rate": 0.0004982607654358668,
|
||
|
|
"loss": 5.6596,
|
||
|
|
"mean_token_accuracy": 0.15974192917346955,
|
||
|
|
"num_tokens": 10463771.0,
|
||
|
|
"step": 5675
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.769126272201538,
|
||
|
|
"epoch": 0.47721067002730516,
|
||
|
|
"grad_norm": 0.875,
|
||
|
|
"learning_rate": 0.000498257047175993,
|
||
|
|
"loss": 5.5908,
|
||
|
|
"mean_token_accuracy": 0.15908040702342988,
|
||
|
|
"num_tokens": 10473783.0,
|
||
|
|
"step": 5680
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.78115234375,
|
||
|
|
"epoch": 0.47763074984247006,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004982533249612357,
|
||
|
|
"loss": 5.5629,
|
||
|
|
"mean_token_accuracy": 0.16332129687070845,
|
||
|
|
"num_tokens": 10483424.0,
|
||
|
|
"step": 5685
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.69402847290039,
|
||
|
|
"epoch": 0.47805082965763496,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004982495987916607,
|
||
|
|
"loss": 5.5045,
|
||
|
|
"mean_token_accuracy": 0.1687542662024498,
|
||
|
|
"num_tokens": 10492536.0,
|
||
|
|
"step": 5690
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.782306718826294,
|
||
|
|
"epoch": 0.47847090947279985,
|
||
|
|
"grad_norm": 1.0625,
|
||
|
|
"learning_rate": 0.0004982458686673339,
|
||
|
|
"loss": 5.6148,
|
||
|
|
"mean_token_accuracy": 0.15962855368852616,
|
||
|
|
"num_tokens": 10501616.0,
|
||
|
|
"step": 5695
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8774285316467285,
|
||
|
|
"epoch": 0.4788909892879647,
|
||
|
|
"grad_norm": 1.0,
|
||
|
|
"learning_rate": 0.0004982421345883217,
|
||
|
|
"loss": 5.6435,
|
||
|
|
"mean_token_accuracy": 0.1528232589364052,
|
||
|
|
"num_tokens": 10511190.0,
|
||
|
|
"step": 5700
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.737439727783203,
|
||
|
|
"epoch": 0.4793110691031296,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004982383965546898,
|
||
|
|
"loss": 5.5899,
|
||
|
|
"mean_token_accuracy": 0.15596046000719072,
|
||
|
|
"num_tokens": 10520310.0,
|
||
|
|
"step": 5705
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.806997585296631,
|
||
|
|
"epoch": 0.4797311489182945,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004982346545665048,
|
||
|
|
"loss": 5.563,
|
||
|
|
"mean_token_accuracy": 0.16304250210523605,
|
||
|
|
"num_tokens": 10528711.0,
|
||
|
|
"step": 5710
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.757972669601441,
|
||
|
|
"epoch": 0.48015122873345933,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004982309086238328,
|
||
|
|
"loss": 5.6498,
|
||
|
|
"mean_token_accuracy": 0.15384584218263625,
|
||
|
|
"num_tokens": 10538484.0,
|
||
|
|
"step": 5715
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7749903202056885,
|
||
|
|
"epoch": 0.48057130854862423,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004982271587267403,
|
||
|
|
"loss": 5.5947,
|
||
|
|
"mean_token_accuracy": 0.15901431441307068,
|
||
|
|
"num_tokens": 10547623.0,
|
||
|
|
"step": 5720
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7751219272613525,
|
||
|
|
"epoch": 0.48099138836378913,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004982234048752935,
|
||
|
|
"loss": 5.5458,
|
||
|
|
"mean_token_accuracy": 0.16144074499607086,
|
||
|
|
"num_tokens": 10556234.0,
|
||
|
|
"step": 5725
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.856562280654908,
|
||
|
|
"epoch": 0.481411468178954,
|
||
|
|
"grad_norm": 0.94921875,
|
||
|
|
"learning_rate": 0.000498219647069559,
|
||
|
|
"loss": 5.7641,
|
||
|
|
"mean_token_accuracy": 0.1533028818666935,
|
||
|
|
"num_tokens": 10566308.0,
|
||
|
|
"step": 5730
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8091706275939945,
|
||
|
|
"epoch": 0.48183154799411887,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004982158853096035,
|
||
|
|
"loss": 5.7108,
|
||
|
|
"mean_token_accuracy": 0.15445562452077866,
|
||
|
|
"num_tokens": 10575212.0,
|
||
|
|
"step": 5735
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.755967473983764,
|
||
|
|
"epoch": 0.48225162780928377,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004982121195954935,
|
||
|
|
"loss": 5.4688,
|
||
|
|
"mean_token_accuracy": 0.1693451941013336,
|
||
|
|
"num_tokens": 10584590.0,
|
||
|
|
"step": 5740
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.736726951599121,
|
||
|
|
"epoch": 0.48267170762444866,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004982083499272957,
|
||
|
|
"loss": 5.5512,
|
||
|
|
"mean_token_accuracy": 0.16557496339082717,
|
||
|
|
"num_tokens": 10593997.0,
|
||
|
|
"step": 5745
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.806335926055908,
|
||
|
|
"epoch": 0.4830917874396135,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004982045763050768,
|
||
|
|
"loss": 5.6777,
|
||
|
|
"mean_token_accuracy": 0.157341568171978,
|
||
|
|
"num_tokens": 10603299.0,
|
||
|
|
"step": 5750
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.790657663345337,
|
||
|
|
"epoch": 0.4835118672547784,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004982007987289041,
|
||
|
|
"loss": 5.5987,
|
||
|
|
"mean_token_accuracy": 0.15882896780967712,
|
||
|
|
"num_tokens": 10613546.0,
|
||
|
|
"step": 5755
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.743067026138306,
|
||
|
|
"epoch": 0.4839319470699433,
|
||
|
|
"grad_norm": 1.0234375,
|
||
|
|
"learning_rate": 0.0004981970171988439,
|
||
|
|
"loss": 5.5707,
|
||
|
|
"mean_token_accuracy": 0.16890112310647964,
|
||
|
|
"num_tokens": 10622966.0,
|
||
|
|
"step": 5760
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.773163938522339,
|
||
|
|
"epoch": 0.48435202688510814,
|
||
|
|
"grad_norm": 1.0859375,
|
||
|
|
"learning_rate": 0.0004981932317149636,
|
||
|
|
"loss": 5.6484,
|
||
|
|
"mean_token_accuracy": 0.1565729409456253,
|
||
|
|
"num_tokens": 10633441.0,
|
||
|
|
"step": 5765
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.843293190002441,
|
||
|
|
"epoch": 0.48477210670027304,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.00049818944227733,
|
||
|
|
"loss": 5.6374,
|
||
|
|
"mean_token_accuracy": 0.15993442833423616,
|
||
|
|
"num_tokens": 10643124.0,
|
||
|
|
"step": 5770
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.831496477127075,
|
||
|
|
"epoch": 0.48519218651543794,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004981856488860105,
|
||
|
|
"loss": 5.6117,
|
||
|
|
"mean_token_accuracy": 0.1523417502641678,
|
||
|
|
"num_tokens": 10652517.0,
|
||
|
|
"step": 5775
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.804540205001831,
|
||
|
|
"epoch": 0.48561226633060284,
|
||
|
|
"grad_norm": 0.99609375,
|
||
|
|
"learning_rate": 0.0004981818515410721,
|
||
|
|
"loss": 5.6591,
|
||
|
|
"mean_token_accuracy": 0.1497793585062027,
|
||
|
|
"num_tokens": 10663352.0,
|
||
|
|
"step": 5780
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.732200670242309,
|
||
|
|
"epoch": 0.4860323461457677,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004981780502425821,
|
||
|
|
"loss": 5.6688,
|
||
|
|
"mean_token_accuracy": 0.15934486985206603,
|
||
|
|
"num_tokens": 10672430.0,
|
||
|
|
"step": 5785
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7780238628387455,
|
||
|
|
"epoch": 0.4864524259609326,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004981742449906079,
|
||
|
|
"loss": 5.6075,
|
||
|
|
"mean_token_accuracy": 0.16593022048473358,
|
||
|
|
"num_tokens": 10681908.0,
|
||
|
|
"step": 5790
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.821439170837403,
|
||
|
|
"epoch": 0.4868725057760975,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.0004981704357852168,
|
||
|
|
"loss": 5.6032,
|
||
|
|
"mean_token_accuracy": 0.16017231941223145,
|
||
|
|
"num_tokens": 10691259.0,
|
||
|
|
"step": 5795
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.739565515518189,
|
||
|
|
"epoch": 0.4872925855912623,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004981666226264764,
|
||
|
|
"loss": 5.5018,
|
||
|
|
"mean_token_accuracy": 0.16552049070596694,
|
||
|
|
"num_tokens": 10699668.0,
|
||
|
|
"step": 5800
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.741326379776001,
|
||
|
|
"epoch": 0.4877126654064272,
|
||
|
|
"grad_norm": 0.84765625,
|
||
|
|
"learning_rate": 0.0004981628055144542,
|
||
|
|
"loss": 5.5384,
|
||
|
|
"mean_token_accuracy": 0.16326582431793213,
|
||
|
|
"num_tokens": 10709146.0,
|
||
|
|
"step": 5805
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.826295614242554,
|
||
|
|
"epoch": 0.4881327452215921,
|
||
|
|
"grad_norm": 0.90234375,
|
||
|
|
"learning_rate": 0.0004981589844492177,
|
||
|
|
"loss": 5.6268,
|
||
|
|
"mean_token_accuracy": 0.1511153683066368,
|
||
|
|
"num_tokens": 10718724.0,
|
||
|
|
"step": 5810
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.774454784393311,
|
||
|
|
"epoch": 0.488552825036757,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004981551594308349,
|
||
|
|
"loss": 5.6002,
|
||
|
|
"mean_token_accuracy": 0.16163085922598838,
|
||
|
|
"num_tokens": 10728101.0,
|
||
|
|
"step": 5815
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.8604474544525145,
|
||
|
|
"epoch": 0.48897290485192185,
|
||
|
|
"grad_norm": 1.015625,
|
||
|
|
"learning_rate": 0.0004981513304593733,
|
||
|
|
"loss": 5.5894,
|
||
|
|
"mean_token_accuracy": 0.16614548563957215,
|
||
|
|
"num_tokens": 10736750.0,
|
||
|
|
"step": 5820
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.813880395889282,
|
||
|
|
"epoch": 0.48939298466708675,
|
||
|
|
"grad_norm": 0.89453125,
|
||
|
|
"learning_rate": 0.0004981474975349006,
|
||
|
|
"loss": 5.7934,
|
||
|
|
"mean_token_accuracy": 0.15620144009590148,
|
||
|
|
"num_tokens": 10746914.0,
|
||
|
|
"step": 5825
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.775779962539673,
|
||
|
|
"epoch": 0.48981306448225165,
|
||
|
|
"grad_norm": 1.0078125,
|
||
|
|
"learning_rate": 0.000498143660657485,
|
||
|
|
"loss": 5.6266,
|
||
|
|
"mean_token_accuracy": 0.160403074324131,
|
||
|
|
"num_tokens": 10755786.0,
|
||
|
|
"step": 5830
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.672336006164551,
|
||
|
|
"epoch": 0.4902331442974165,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004981398198271944,
|
||
|
|
"loss": 5.512,
|
||
|
|
"mean_token_accuracy": 0.16457450538873672,
|
||
|
|
"num_tokens": 10764821.0,
|
||
|
|
"step": 5835
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.762319898605346,
|
||
|
|
"epoch": 0.4906532241125814,
|
||
|
|
"grad_norm": 0.97265625,
|
||
|
|
"learning_rate": 0.0004981359750440968,
|
||
|
|
"loss": 5.5981,
|
||
|
|
"mean_token_accuracy": 0.15791754126548768,
|
||
|
|
"num_tokens": 10773569.0,
|
||
|
|
"step": 5840
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.703838157653808,
|
||
|
|
"epoch": 0.4910733039277463,
|
||
|
|
"grad_norm": 0.98046875,
|
||
|
|
"learning_rate": 0.0004981321263082603,
|
||
|
|
"loss": 5.5547,
|
||
|
|
"mean_token_accuracy": 0.15730964243412018,
|
||
|
|
"num_tokens": 10782298.0,
|
||
|
|
"step": 5845
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.705076360702515,
|
||
|
|
"epoch": 0.4914933837429111,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.000498128273619753,
|
||
|
|
"loss": 5.5491,
|
||
|
|
"mean_token_accuracy": 0.1628515049815178,
|
||
|
|
"num_tokens": 10792087.0,
|
||
|
|
"step": 5850
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.771277141571045,
|
||
|
|
"epoch": 0.491913463558076,
|
||
|
|
"grad_norm": 0.9140625,
|
||
|
|
"learning_rate": 0.0004981244169786433,
|
||
|
|
"loss": 5.6458,
|
||
|
|
"mean_token_accuracy": 0.15582741051912308,
|
||
|
|
"num_tokens": 10801641.0,
|
||
|
|
"step": 5855
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.861782169342041,
|
||
|
|
"epoch": 0.4923335433732409,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004981205563849994,
|
||
|
|
"loss": 5.7007,
|
||
|
|
"mean_token_accuracy": 0.15648430287837983,
|
||
|
|
"num_tokens": 10811612.0,
|
||
|
|
"step": 5860
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.788508701324463,
|
||
|
|
"epoch": 0.4927536231884058,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004981166918388897,
|
||
|
|
"loss": 5.5149,
|
||
|
|
"mean_token_accuracy": 0.16366831362247466,
|
||
|
|
"num_tokens": 10821608.0,
|
||
|
|
"step": 5865
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.720433568954467,
|
||
|
|
"epoch": 0.49317370300357066,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.0004981128233403828,
|
||
|
|
"loss": 5.4915,
|
||
|
|
"mean_token_accuracy": 0.16485851109027863,
|
||
|
|
"num_tokens": 10830679.0,
|
||
|
|
"step": 5870
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.718778944015503,
|
||
|
|
"epoch": 0.49359378281873556,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.000498108950889547,
|
||
|
|
"loss": 5.5507,
|
||
|
|
"mean_token_accuracy": 0.16066077202558518,
|
||
|
|
"num_tokens": 10839669.0,
|
||
|
|
"step": 5875
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.787919759750366,
|
||
|
|
"epoch": 0.49401386263390046,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.0004981050744864512,
|
||
|
|
"loss": 5.5387,
|
||
|
|
"mean_token_accuracy": 0.16012917906045915,
|
||
|
|
"num_tokens": 10849666.0,
|
||
|
|
"step": 5880
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.731645965576172,
|
||
|
|
"epoch": 0.4944339424490653,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004981011941311638,
|
||
|
|
"loss": 5.455,
|
||
|
|
"mean_token_accuracy": 0.1706133618950844,
|
||
|
|
"num_tokens": 10858225.0,
|
||
|
|
"step": 5885
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7152073860168455,
|
||
|
|
"epoch": 0.4948540222642302,
|
||
|
|
"grad_norm": 0.9609375,
|
||
|
|
"learning_rate": 0.0004980973098237535,
|
||
|
|
"loss": 5.5608,
|
||
|
|
"mean_token_accuracy": 0.1573803097009659,
|
||
|
|
"num_tokens": 10867466.0,
|
||
|
|
"step": 5890
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.793262910842896,
|
||
|
|
"epoch": 0.4952741020793951,
|
||
|
|
"grad_norm": 0.95703125,
|
||
|
|
"learning_rate": 0.0004980934215642894,
|
||
|
|
"loss": 5.5967,
|
||
|
|
"mean_token_accuracy": 0.1668254867196083,
|
||
|
|
"num_tokens": 10875850.0,
|
||
|
|
"step": 5895
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.676056289672852,
|
||
|
|
"epoch": 0.49569418189456,
|
||
|
|
"grad_norm": 0.98828125,
|
||
|
|
"learning_rate": 0.00049808952935284,
|
||
|
|
"loss": 5.5231,
|
||
|
|
"mean_token_accuracy": 0.16948444843292237,
|
||
|
|
"num_tokens": 10885154.0,
|
||
|
|
"step": 5900
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.739302301406861,
|
||
|
|
"epoch": 0.49611426170972484,
|
||
|
|
"grad_norm": 0.984375,
|
||
|
|
"learning_rate": 0.0004980856331894747,
|
||
|
|
"loss": 5.6296,
|
||
|
|
"mean_token_accuracy": 0.16090053021907808,
|
||
|
|
"num_tokens": 10894080.0,
|
||
|
|
"step": 5905
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7569280624389645,
|
||
|
|
"epoch": 0.49653434152488973,
|
||
|
|
"grad_norm": 0.9453125,
|
||
|
|
"learning_rate": 0.0004980817330742621,
|
||
|
|
"loss": 5.6161,
|
||
|
|
"mean_token_accuracy": 0.15483176559209824,
|
||
|
|
"num_tokens": 10903248.0,
|
||
|
|
"step": 5910
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.768988418579101,
|
||
|
|
"epoch": 0.49695442134005463,
|
||
|
|
"grad_norm": 0.91015625,
|
||
|
|
"learning_rate": 0.0004980778290072716,
|
||
|
|
"loss": 5.5804,
|
||
|
|
"mean_token_accuracy": 0.16294265836477279,
|
||
|
|
"num_tokens": 10912939.0,
|
||
|
|
"step": 5915
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.777530717849731,
|
||
|
|
"epoch": 0.4973745011552195,
|
||
|
|
"grad_norm": 0.9765625,
|
||
|
|
"learning_rate": 0.0004980739209885722,
|
||
|
|
"loss": 5.6127,
|
||
|
|
"mean_token_accuracy": 0.16438234001398086,
|
||
|
|
"num_tokens": 10921505.0,
|
||
|
|
"step": 5920
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.802098226547241,
|
||
|
|
"epoch": 0.49779458097038437,
|
||
|
|
"grad_norm": 0.9375,
|
||
|
|
"learning_rate": 0.0004980700090182331,
|
||
|
|
"loss": 5.6819,
|
||
|
|
"mean_token_accuracy": 0.16335346847772597,
|
||
|
|
"num_tokens": 10931861.0,
|
||
|
|
"step": 5925
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.83542947769165,
|
||
|
|
"epoch": 0.49821466078554927,
|
||
|
|
"grad_norm": 0.921875,
|
||
|
|
"learning_rate": 0.0004980660930963238,
|
||
|
|
"loss": 5.5848,
|
||
|
|
"mean_token_accuracy": 0.16074420511722565,
|
||
|
|
"num_tokens": 10940810.0,
|
||
|
|
"step": 5930
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.723906135559082,
|
||
|
|
"epoch": 0.4986347406007141,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004980621732229133,
|
||
|
|
"loss": 5.4722,
|
||
|
|
"mean_token_accuracy": 0.16402249783277512,
|
||
|
|
"num_tokens": 10949514.0,
|
||
|
|
"step": 5935
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.749081373214722,
|
||
|
|
"epoch": 0.499054820415879,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004980582493980714,
|
||
|
|
"loss": 5.6742,
|
||
|
|
"mean_token_accuracy": 0.1556909427046776,
|
||
|
|
"num_tokens": 10959161.0,
|
||
|
|
"step": 5940
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.750719594955444,
|
||
|
|
"epoch": 0.4994749002310439,
|
||
|
|
"grad_norm": 0.890625,
|
||
|
|
"learning_rate": 0.0004980543216218674,
|
||
|
|
"loss": 5.5569,
|
||
|
|
"mean_token_accuracy": 0.17051900774240494,
|
||
|
|
"num_tokens": 10968983.0,
|
||
|
|
"step": 5945
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.795907783508301,
|
||
|
|
"epoch": 0.4998949800462088,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004980503898943711,
|
||
|
|
"loss": 5.6755,
|
||
|
|
"mean_token_accuracy": 0.16463214308023452,
|
||
|
|
"num_tokens": 10978044.0,
|
||
|
|
"step": 5950
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.818535089492798,
|
||
|
|
"epoch": 0.5003150598613737,
|
||
|
|
"grad_norm": 0.90625,
|
||
|
|
"learning_rate": 0.0004980464542156519,
|
||
|
|
"loss": 5.5895,
|
||
|
|
"mean_token_accuracy": 0.16786763817071915,
|
||
|
|
"num_tokens": 10986980.0,
|
||
|
|
"step": 5955
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.744042301177979,
|
||
|
|
"epoch": 0.5007351396765385,
|
||
|
|
"grad_norm": 0.953125,
|
||
|
|
"learning_rate": 0.0004980425145857796,
|
||
|
|
"loss": 5.5404,
|
||
|
|
"mean_token_accuracy": 0.17190210670232772,
|
||
|
|
"num_tokens": 10995163.0,
|
||
|
|
"step": 5960
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.6839663028717045,
|
||
|
|
"epoch": 0.5011552194917034,
|
||
|
|
"grad_norm": 0.92578125,
|
||
|
|
"learning_rate": 0.000498038571004824,
|
||
|
|
"loss": 5.4658,
|
||
|
|
"mean_token_accuracy": 0.1701178327202797,
|
||
|
|
"num_tokens": 11003722.0,
|
||
|
|
"step": 5965
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.658802843093872,
|
||
|
|
"epoch": 0.5015752993068683,
|
||
|
|
"grad_norm": 0.93359375,
|
||
|
|
"learning_rate": 0.0004980346234728549,
|
||
|
|
"loss": 5.5459,
|
||
|
|
"mean_token_accuracy": 0.1696319282054901,
|
||
|
|
"num_tokens": 11013176.0,
|
||
|
|
"step": 5970
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.7755608558654785,
|
||
|
|
"epoch": 0.5019953791220332,
|
||
|
|
"grad_norm": 0.94140625,
|
||
|
|
"learning_rate": 0.0004980306719899424,
|
||
|
|
"loss": 5.601,
|
||
|
|
"mean_token_accuracy": 0.16234323978424073,
|
||
|
|
"num_tokens": 11022636.0,
|
||
|
|
"step": 5975
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.711779022216797,
|
||
|
|
"epoch": 0.5024154589371981,
|
||
|
|
"grad_norm": 0.96875,
|
||
|
|
"learning_rate": 0.0004980267165561564,
|
||
|
|
"loss": 5.5409,
|
||
|
|
"mean_token_accuracy": 0.16729752868413925,
|
||
|
|
"num_tokens": 11031896.0,
|
||
|
|
"step": 5980
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.725300073623657,
|
||
|
|
"epoch": 0.502835538752363,
|
||
|
|
"grad_norm": 0.91796875,
|
||
|
|
"learning_rate": 0.0004980227571715669,
|
||
|
|
"loss": 5.579,
|
||
|
|
"mean_token_accuracy": 0.15976378172636033,
|
||
|
|
"num_tokens": 11040802.0,
|
||
|
|
"step": 5985
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.731253290176392,
|
||
|
|
"epoch": 0.5032556185675279,
|
||
|
|
"grad_norm": 0.96484375,
|
||
|
|
"learning_rate": 0.0004980187938362441,
|
||
|
|
"loss": 5.5153,
|
||
|
|
"mean_token_accuracy": 0.1588967353105545,
|
||
|
|
"num_tokens": 11049701.0,
|
||
|
|
"step": 5990
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.786366033554077,
|
||
|
|
"epoch": 0.5036756983826927,
|
||
|
|
"grad_norm": 0.9296875,
|
||
|
|
"learning_rate": 0.0004980148265502581,
|
||
|
|
"loss": 5.694,
|
||
|
|
"mean_token_accuracy": 0.15498168617486954,
|
||
|
|
"num_tokens": 11059555.0,
|
||
|
|
"step": 5995
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"entropy": 5.793335866928101,
|
||
|
|
"epoch": 0.5040957781978576,
|
||
|
|
"grad_norm": 0.9921875,
|
||
|
|
"learning_rate": 0.0004980108553136795,
|
||
|
|
"loss": 5.6141,
|
||
|
|
"mean_token_accuracy": 0.16208730340003968,
|
||
|
|
"num_tokens": 11068940.0,
|
||
|
|
"step": 6000
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5040957781978576,
|
||
|
|
"eval_entropy": 5.5702669805797465,
|
||
|
|
"eval_loss": 5.591900825500488,
|
||
|
|
"eval_mean_token_accuracy": 0.1687953193199262,
|
||
|
|
"eval_num_tokens": 11068940.0,
|
||
|
|
"eval_runtime": 21.0876,
|
||
|
|
"eval_samples_per_second": 1771.942,
|
||
|
|
"eval_steps_per_second": 221.505,
|
||
|
|
"step": 6000
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 5,
|
||
|
|
"max_steps": 119020,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 10,
|
||
|
|
"save_steps": 3000,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": false
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 2398509142179840.0,
|
||
|
|
"train_batch_size": 16,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|