Model: fpadovani/swe-latn-100mb-after-ppt-Dp-100mb-ckpt500_seed3407 Source: Original Platform
30090 lines
825 KiB
JSON
30090 lines
825 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.2601974375131275,
|
|
"eval_steps": 3000,
|
|
"global_step": 15000,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"entropy": 7.253151750564575,
|
|
"epoch": 0.0004200798151648813,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 2e-06,
|
|
"loss": 6.8984,
|
|
"mean_token_accuracy": 0.09029425084590911,
|
|
"num_tokens": 8348.0,
|
|
"step": 5
|
|
},
|
|
{
|
|
"entropy": 7.219348526000976,
|
|
"epoch": 0.0008401596303297626,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 4.5e-06,
|
|
"loss": 6.9563,
|
|
"mean_token_accuracy": 0.0939315177500248,
|
|
"num_tokens": 17465.0,
|
|
"step": 10
|
|
},
|
|
{
|
|
"entropy": 7.203219985961914,
|
|
"epoch": 0.001260239445494644,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 7e-06,
|
|
"loss": 6.9004,
|
|
"mean_token_accuracy": 0.08940818756818772,
|
|
"num_tokens": 26627.0,
|
|
"step": 15
|
|
},
|
|
{
|
|
"entropy": 7.224360418319702,
|
|
"epoch": 0.0016803192606595252,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 9.5e-06,
|
|
"loss": 6.9259,
|
|
"mean_token_accuracy": 0.09512931853532791,
|
|
"num_tokens": 36069.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"entropy": 7.186052703857422,
|
|
"epoch": 0.002100399075824407,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 1.2e-05,
|
|
"loss": 6.8641,
|
|
"mean_token_accuracy": 0.09390396177768708,
|
|
"num_tokens": 44967.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"entropy": 7.151274633407593,
|
|
"epoch": 0.002520478890989288,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 1.4500000000000002e-05,
|
|
"loss": 6.964,
|
|
"mean_token_accuracy": 0.08688623458147049,
|
|
"num_tokens": 55132.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"entropy": 7.053877639770508,
|
|
"epoch": 0.0029405587061541692,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 1.7000000000000003e-05,
|
|
"loss": 6.7637,
|
|
"mean_token_accuracy": 0.09662552699446678,
|
|
"num_tokens": 65141.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"entropy": 7.099168729782105,
|
|
"epoch": 0.0033606385213190504,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 1.95e-05,
|
|
"loss": 6.8446,
|
|
"mean_token_accuracy": 0.09567792639136315,
|
|
"num_tokens": 74007.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"entropy": 7.140014743804931,
|
|
"epoch": 0.003780718336483932,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 2.2e-05,
|
|
"loss": 6.8569,
|
|
"mean_token_accuracy": 0.09555562734603881,
|
|
"num_tokens": 83736.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"entropy": 7.1646524429321286,
|
|
"epoch": 0.004200798151648814,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 2.4500000000000003e-05,
|
|
"loss": 6.8505,
|
|
"mean_token_accuracy": 0.09592381715774537,
|
|
"num_tokens": 92525.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"entropy": 7.107996654510498,
|
|
"epoch": 0.004620877966813695,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 2.7e-05,
|
|
"loss": 6.7755,
|
|
"mean_token_accuracy": 0.09664912968873977,
|
|
"num_tokens": 102015.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"entropy": 7.137420606613159,
|
|
"epoch": 0.005040957781978576,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 2.95e-05,
|
|
"loss": 6.9768,
|
|
"mean_token_accuracy": 0.0867392435669899,
|
|
"num_tokens": 110887.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"entropy": 7.134914398193359,
|
|
"epoch": 0.005461037597143457,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 3.2e-05,
|
|
"loss": 6.9024,
|
|
"mean_token_accuracy": 0.08935272470116615,
|
|
"num_tokens": 120442.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"entropy": 7.1443713188171385,
|
|
"epoch": 0.0058811174123083385,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 3.4500000000000005e-05,
|
|
"loss": 6.8989,
|
|
"mean_token_accuracy": 0.09712273105978966,
|
|
"num_tokens": 129297.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"entropy": 7.160055351257324,
|
|
"epoch": 0.00630119722747322,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 3.7e-05,
|
|
"loss": 6.8507,
|
|
"mean_token_accuracy": 0.09877990111708641,
|
|
"num_tokens": 138305.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"entropy": 7.141705083847046,
|
|
"epoch": 0.006721277042638101,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 3.95e-05,
|
|
"loss": 6.8372,
|
|
"mean_token_accuracy": 0.09292416870594025,
|
|
"num_tokens": 147640.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"entropy": 7.002462768554688,
|
|
"epoch": 0.007141356857802983,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 4.2000000000000004e-05,
|
|
"loss": 6.75,
|
|
"mean_token_accuracy": 0.09611514061689377,
|
|
"num_tokens": 157633.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"entropy": 7.100855731964112,
|
|
"epoch": 0.007561436672967864,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 4.45e-05,
|
|
"loss": 6.7673,
|
|
"mean_token_accuracy": 0.08827547580003739,
|
|
"num_tokens": 167984.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"entropy": 7.129158973693848,
|
|
"epoch": 0.007981516488132745,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 4.7000000000000004e-05,
|
|
"loss": 6.8948,
|
|
"mean_token_accuracy": 0.099730733782053,
|
|
"num_tokens": 176984.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"entropy": 7.069277429580689,
|
|
"epoch": 0.008401596303297627,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 4.9500000000000004e-05,
|
|
"loss": 6.7542,
|
|
"mean_token_accuracy": 0.0975344181060791,
|
|
"num_tokens": 185931.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"entropy": 7.183401107788086,
|
|
"epoch": 0.008821676118462508,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 5.2e-05,
|
|
"loss": 6.8312,
|
|
"mean_token_accuracy": 0.09061248749494552,
|
|
"num_tokens": 195065.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"entropy": 7.100264024734497,
|
|
"epoch": 0.00924175593362739,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 5.45e-05,
|
|
"loss": 6.8219,
|
|
"mean_token_accuracy": 0.09686729088425636,
|
|
"num_tokens": 203687.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"entropy": 7.042130661010742,
|
|
"epoch": 0.00966183574879227,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 5.7e-05,
|
|
"loss": 6.8491,
|
|
"mean_token_accuracy": 0.09541062936186791,
|
|
"num_tokens": 212847.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"entropy": 7.127160167694091,
|
|
"epoch": 0.010081915563957152,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 5.9499999999999996e-05,
|
|
"loss": 6.8242,
|
|
"mean_token_accuracy": 0.0955698125064373,
|
|
"num_tokens": 222593.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"entropy": 7.0331028461456295,
|
|
"epoch": 0.010501995379122032,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 6.2e-05,
|
|
"loss": 6.711,
|
|
"mean_token_accuracy": 0.0981454961001873,
|
|
"num_tokens": 231174.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"entropy": 6.9479889392852785,
|
|
"epoch": 0.010922075194286915,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 6.450000000000001e-05,
|
|
"loss": 6.663,
|
|
"mean_token_accuracy": 0.10388810336589813,
|
|
"num_tokens": 239833.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"entropy": 7.190658855438232,
|
|
"epoch": 0.011342155009451797,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 6.7e-05,
|
|
"loss": 6.951,
|
|
"mean_token_accuracy": 0.0949922852218151,
|
|
"num_tokens": 248794.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"entropy": 7.251979541778565,
|
|
"epoch": 0.011762234824616677,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 6.950000000000001e-05,
|
|
"loss": 6.9449,
|
|
"mean_token_accuracy": 0.09376412332057953,
|
|
"num_tokens": 257123.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"entropy": 7.008625459671021,
|
|
"epoch": 0.012182314639781559,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 7.2e-05,
|
|
"loss": 6.6678,
|
|
"mean_token_accuracy": 0.10136394873261452,
|
|
"num_tokens": 266088.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"entropy": 7.118732166290283,
|
|
"epoch": 0.01260239445494644,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 7.45e-05,
|
|
"loss": 6.9922,
|
|
"mean_token_accuracy": 0.08856561928987502,
|
|
"num_tokens": 276074.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"entropy": 7.265724229812622,
|
|
"epoch": 0.013022474270111321,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 7.7e-05,
|
|
"loss": 6.8415,
|
|
"mean_token_accuracy": 0.09575222656130791,
|
|
"num_tokens": 285280.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"entropy": 7.09201340675354,
|
|
"epoch": 0.013442554085276202,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 7.950000000000001e-05,
|
|
"loss": 6.9508,
|
|
"mean_token_accuracy": 0.08855971023440361,
|
|
"num_tokens": 296115.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"entropy": 7.172540521621704,
|
|
"epoch": 0.013862633900441084,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 8.2e-05,
|
|
"loss": 6.7833,
|
|
"mean_token_accuracy": 0.09475113973021507,
|
|
"num_tokens": 305483.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"entropy": 7.142146444320678,
|
|
"epoch": 0.014282713715605966,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 8.450000000000001e-05,
|
|
"loss": 6.798,
|
|
"mean_token_accuracy": 0.09611742347478866,
|
|
"num_tokens": 314000.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"entropy": 7.191121196746826,
|
|
"epoch": 0.014702793530770846,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 8.7e-05,
|
|
"loss": 6.8719,
|
|
"mean_token_accuracy": 0.09523176699876786,
|
|
"num_tokens": 323667.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"entropy": 7.111018800735474,
|
|
"epoch": 0.015122873345935728,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 8.95e-05,
|
|
"loss": 6.9126,
|
|
"mean_token_accuracy": 0.09633364677429199,
|
|
"num_tokens": 332695.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"entropy": 7.152137660980225,
|
|
"epoch": 0.015542953161100609,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 9.2e-05,
|
|
"loss": 6.7081,
|
|
"mean_token_accuracy": 0.09874472171068191,
|
|
"num_tokens": 342428.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"entropy": 7.018444013595581,
|
|
"epoch": 0.01596303297626549,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 9.45e-05,
|
|
"loss": 6.9196,
|
|
"mean_token_accuracy": 0.09128761291503906,
|
|
"num_tokens": 353587.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"entropy": 7.154490900039673,
|
|
"epoch": 0.01638311279143037,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 9.7e-05,
|
|
"loss": 6.8259,
|
|
"mean_token_accuracy": 0.09795344024896621,
|
|
"num_tokens": 362997.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"entropy": 7.170652723312378,
|
|
"epoch": 0.016803192606595255,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 9.95e-05,
|
|
"loss": 6.8563,
|
|
"mean_token_accuracy": 0.09546189531683921,
|
|
"num_tokens": 372346.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"entropy": 7.110174036026001,
|
|
"epoch": 0.017223272421760135,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000102,
|
|
"loss": 6.741,
|
|
"mean_token_accuracy": 0.09936807751655578,
|
|
"num_tokens": 381575.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"entropy": 7.106418037414551,
|
|
"epoch": 0.017643352236925015,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00010449999999999999,
|
|
"loss": 6.8292,
|
|
"mean_token_accuracy": 0.09433561563491821,
|
|
"num_tokens": 390706.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"entropy": 7.192156839370727,
|
|
"epoch": 0.018063432052089896,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000107,
|
|
"loss": 6.8516,
|
|
"mean_token_accuracy": 0.0938378892838955,
|
|
"num_tokens": 400000.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"entropy": 6.9766045093536375,
|
|
"epoch": 0.01848351186725478,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0001095,
|
|
"loss": 6.8445,
|
|
"mean_token_accuracy": 0.10025399252772331,
|
|
"num_tokens": 409447.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"entropy": 7.228094434738159,
|
|
"epoch": 0.01890359168241966,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000112,
|
|
"loss": 6.7628,
|
|
"mean_token_accuracy": 0.09841207265853882,
|
|
"num_tokens": 418417.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"entropy": 6.980171537399292,
|
|
"epoch": 0.01932367149758454,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0001145,
|
|
"loss": 6.7865,
|
|
"mean_token_accuracy": 0.10002906545996666,
|
|
"num_tokens": 427619.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"entropy": 7.174957370758056,
|
|
"epoch": 0.019743751312749424,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.00011700000000000001,
|
|
"loss": 6.8595,
|
|
"mean_token_accuracy": 0.09506258964538575,
|
|
"num_tokens": 437931.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"entropy": 7.124919366836548,
|
|
"epoch": 0.020163831127914304,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00011949999999999999,
|
|
"loss": 6.8624,
|
|
"mean_token_accuracy": 0.10153809040784836,
|
|
"num_tokens": 447595.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"entropy": 7.076248693466186,
|
|
"epoch": 0.020583910943079185,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000122,
|
|
"loss": 6.7534,
|
|
"mean_token_accuracy": 0.09595257192850112,
|
|
"num_tokens": 457062.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"entropy": 7.11194372177124,
|
|
"epoch": 0.021003990758244065,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0001245,
|
|
"loss": 6.8486,
|
|
"mean_token_accuracy": 0.09663526639342308,
|
|
"num_tokens": 466191.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"entropy": 7.119431734085083,
|
|
"epoch": 0.02142407057340895,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000127,
|
|
"loss": 6.8411,
|
|
"mean_token_accuracy": 0.09689914286136628,
|
|
"num_tokens": 475693.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"entropy": 7.101942634582519,
|
|
"epoch": 0.02184415038857383,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0001295,
|
|
"loss": 6.8733,
|
|
"mean_token_accuracy": 0.0926995851099491,
|
|
"num_tokens": 485173.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"entropy": 7.02587628364563,
|
|
"epoch": 0.02226423020373871,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000132,
|
|
"loss": 6.7261,
|
|
"mean_token_accuracy": 0.1030467577278614,
|
|
"num_tokens": 493985.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"entropy": 7.204363059997559,
|
|
"epoch": 0.022684310018903593,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00013450000000000002,
|
|
"loss": 6.8283,
|
|
"mean_token_accuracy": 0.09744107499718666,
|
|
"num_tokens": 502837.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"entropy": 7.018724775314331,
|
|
"epoch": 0.023104389834068473,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00013700000000000002,
|
|
"loss": 6.7299,
|
|
"mean_token_accuracy": 0.10230938643217087,
|
|
"num_tokens": 511503.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"entropy": 7.203450679779053,
|
|
"epoch": 0.023524469649233354,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0001395,
|
|
"loss": 6.9591,
|
|
"mean_token_accuracy": 0.09394779950380325,
|
|
"num_tokens": 521499.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"entropy": 7.085290002822876,
|
|
"epoch": 0.023944549464398234,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00014199999999999998,
|
|
"loss": 6.7304,
|
|
"mean_token_accuracy": 0.09568566456437111,
|
|
"num_tokens": 530067.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"entropy": 7.13321099281311,
|
|
"epoch": 0.024364629279563118,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0001445,
|
|
"loss": 6.7422,
|
|
"mean_token_accuracy": 0.10064690634608268,
|
|
"num_tokens": 538559.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"entropy": 7.135575151443481,
|
|
"epoch": 0.024784709094728,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000147,
|
|
"loss": 6.9403,
|
|
"mean_token_accuracy": 0.09277286529541015,
|
|
"num_tokens": 547288.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"entropy": 7.132419538497925,
|
|
"epoch": 0.02520478890989288,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0001495,
|
|
"loss": 6.8165,
|
|
"mean_token_accuracy": 0.09548124819993972,
|
|
"num_tokens": 557269.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"entropy": 7.107891798019409,
|
|
"epoch": 0.025624868725057762,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000152,
|
|
"loss": 6.8392,
|
|
"mean_token_accuracy": 0.09125733524560928,
|
|
"num_tokens": 567280.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"entropy": 7.009502935409546,
|
|
"epoch": 0.026044948540222643,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00015450000000000001,
|
|
"loss": 6.6443,
|
|
"mean_token_accuracy": 0.0992837019264698,
|
|
"num_tokens": 576609.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"entropy": 6.9137735843658445,
|
|
"epoch": 0.026465028355387523,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000157,
|
|
"loss": 6.627,
|
|
"mean_token_accuracy": 0.1031131848692894,
|
|
"num_tokens": 586053.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"entropy": 7.00633134841919,
|
|
"epoch": 0.026885108170552403,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0001595,
|
|
"loss": 6.8295,
|
|
"mean_token_accuracy": 0.09749070778489113,
|
|
"num_tokens": 594649.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"entropy": 7.084966850280762,
|
|
"epoch": 0.027305187985717287,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000162,
|
|
"loss": 6.7142,
|
|
"mean_token_accuracy": 0.09451463893055916,
|
|
"num_tokens": 603445.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"entropy": 7.088579750061035,
|
|
"epoch": 0.027725267800882167,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00016450000000000001,
|
|
"loss": 6.8658,
|
|
"mean_token_accuracy": 0.09315285831689835,
|
|
"num_tokens": 613611.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"entropy": 7.3623809814453125,
|
|
"epoch": 0.028145347616047048,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00016700000000000002,
|
|
"loss": 7.0713,
|
|
"mean_token_accuracy": 0.0919966921210289,
|
|
"num_tokens": 623024.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"entropy": 6.999190092086792,
|
|
"epoch": 0.02856542743121193,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00016950000000000003,
|
|
"loss": 6.7385,
|
|
"mean_token_accuracy": 0.1017606370151043,
|
|
"num_tokens": 631624.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"entropy": 6.994140338897705,
|
|
"epoch": 0.028985507246376812,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00017199999999999998,
|
|
"loss": 6.6633,
|
|
"mean_token_accuracy": 0.10479742139577866,
|
|
"num_tokens": 640473.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"entropy": 7.0762580871582035,
|
|
"epoch": 0.029405587061541692,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00017449999999999999,
|
|
"loss": 6.9006,
|
|
"mean_token_accuracy": 0.09429975003004074,
|
|
"num_tokens": 649692.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"entropy": 7.142873811721802,
|
|
"epoch": 0.029825666876706573,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000177,
|
|
"loss": 6.8377,
|
|
"mean_token_accuracy": 0.09720863476395607,
|
|
"num_tokens": 658236.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"entropy": 6.952843904495239,
|
|
"epoch": 0.030245746691871456,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0001795,
|
|
"loss": 6.6251,
|
|
"mean_token_accuracy": 0.09688405320048332,
|
|
"num_tokens": 667175.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"entropy": 7.14975872039795,
|
|
"epoch": 0.030665826507036337,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000182,
|
|
"loss": 6.9438,
|
|
"mean_token_accuracy": 0.09247877895832061,
|
|
"num_tokens": 676456.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"entropy": 7.224644327163697,
|
|
"epoch": 0.031085906322201217,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0001845,
|
|
"loss": 6.926,
|
|
"mean_token_accuracy": 0.08845363929867744,
|
|
"num_tokens": 686881.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"entropy": 6.932204675674439,
|
|
"epoch": 0.0315059861373661,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000187,
|
|
"loss": 6.6985,
|
|
"mean_token_accuracy": 0.09394535645842553,
|
|
"num_tokens": 696045.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"entropy": 6.965549278259277,
|
|
"epoch": 0.03192606595253098,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0001895,
|
|
"loss": 6.6653,
|
|
"mean_token_accuracy": 0.10160319805145264,
|
|
"num_tokens": 704729.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"entropy": 6.985969495773316,
|
|
"epoch": 0.032346145767695865,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000192,
|
|
"loss": 6.7426,
|
|
"mean_token_accuracy": 0.0950203962624073,
|
|
"num_tokens": 714331.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"entropy": 7.101364660263061,
|
|
"epoch": 0.03276622558286074,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0001945,
|
|
"loss": 6.6912,
|
|
"mean_token_accuracy": 0.10189466029405594,
|
|
"num_tokens": 722788.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"entropy": 7.007110738754273,
|
|
"epoch": 0.033186305398025626,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00019700000000000002,
|
|
"loss": 6.7849,
|
|
"mean_token_accuracy": 0.10113223120570183,
|
|
"num_tokens": 731417.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"entropy": 7.0138044357299805,
|
|
"epoch": 0.03360638521319051,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.00019950000000000002,
|
|
"loss": 6.7534,
|
|
"mean_token_accuracy": 0.09411026313900947,
|
|
"num_tokens": 741034.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"entropy": 7.00489068031311,
|
|
"epoch": 0.034026465028355386,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000202,
|
|
"loss": 6.7516,
|
|
"mean_token_accuracy": 0.09877323731780052,
|
|
"num_tokens": 749596.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"entropy": 7.017527103424072,
|
|
"epoch": 0.03444654484352027,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00020449999999999998,
|
|
"loss": 6.7218,
|
|
"mean_token_accuracy": 0.09416642934083938,
|
|
"num_tokens": 758931.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"entropy": 6.871344518661499,
|
|
"epoch": 0.03486662465868515,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000207,
|
|
"loss": 6.6216,
|
|
"mean_token_accuracy": 0.10105381533503532,
|
|
"num_tokens": 767534.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"entropy": 6.960817480087281,
|
|
"epoch": 0.03528670447385003,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0002095,
|
|
"loss": 6.6761,
|
|
"mean_token_accuracy": 0.10064006224274635,
|
|
"num_tokens": 776456.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"entropy": 7.008622884750366,
|
|
"epoch": 0.035706784289014915,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000212,
|
|
"loss": 6.7646,
|
|
"mean_token_accuracy": 0.09697613269090652,
|
|
"num_tokens": 786172.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"entropy": 6.960406351089477,
|
|
"epoch": 0.03612686410417979,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0002145,
|
|
"loss": 6.6658,
|
|
"mean_token_accuracy": 0.10456070601940155,
|
|
"num_tokens": 795081.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"entropy": 6.916972398757935,
|
|
"epoch": 0.036546943919344675,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00021700000000000002,
|
|
"loss": 6.6979,
|
|
"mean_token_accuracy": 0.09141508191823959,
|
|
"num_tokens": 804259.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"entropy": 6.980181550979614,
|
|
"epoch": 0.03696702373450956,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0002195,
|
|
"loss": 6.7251,
|
|
"mean_token_accuracy": 0.0985159382224083,
|
|
"num_tokens": 813463.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"entropy": 6.952114248275757,
|
|
"epoch": 0.037387103549674436,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000222,
|
|
"loss": 6.6346,
|
|
"mean_token_accuracy": 0.10426531285047531,
|
|
"num_tokens": 823029.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"entropy": 6.91859712600708,
|
|
"epoch": 0.03780718336483932,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002245,
|
|
"loss": 6.7327,
|
|
"mean_token_accuracy": 0.0944428451359272,
|
|
"num_tokens": 832902.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"entropy": 6.92227520942688,
|
|
"epoch": 0.0382272631800042,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.00022700000000000002,
|
|
"loss": 6.6724,
|
|
"mean_token_accuracy": 0.10073406398296356,
|
|
"num_tokens": 842162.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"entropy": 6.979531192779541,
|
|
"epoch": 0.03864734299516908,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.00022950000000000002,
|
|
"loss": 6.7091,
|
|
"mean_token_accuracy": 0.0995998091995716,
|
|
"num_tokens": 852328.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"entropy": 6.8678590774536135,
|
|
"epoch": 0.039067422810333964,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00023200000000000003,
|
|
"loss": 6.6831,
|
|
"mean_token_accuracy": 0.1023336872458458,
|
|
"num_tokens": 860929.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"entropy": 7.0033402919769285,
|
|
"epoch": 0.03948750262549885,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00023449999999999998,
|
|
"loss": 6.7492,
|
|
"mean_token_accuracy": 0.09593449011445046,
|
|
"num_tokens": 869144.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"entropy": 7.003532409667969,
|
|
"epoch": 0.039907582440663725,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000237,
|
|
"loss": 6.705,
|
|
"mean_token_accuracy": 0.10385636389255523,
|
|
"num_tokens": 877447.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"entropy": 6.86921706199646,
|
|
"epoch": 0.04032766225582861,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0002395,
|
|
"loss": 6.6601,
|
|
"mean_token_accuracy": 0.09642177075147629,
|
|
"num_tokens": 887020.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"entropy": 6.996332120895386,
|
|
"epoch": 0.040747742070993485,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000242,
|
|
"loss": 6.7054,
|
|
"mean_token_accuracy": 0.0975713811814785,
|
|
"num_tokens": 895937.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"entropy": 6.852901887893677,
|
|
"epoch": 0.04116782188615837,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0002445,
|
|
"loss": 6.7267,
|
|
"mean_token_accuracy": 0.09650165066123009,
|
|
"num_tokens": 905446.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"entropy": 6.893301010131836,
|
|
"epoch": 0.04158790170132325,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000247,
|
|
"loss": 6.6036,
|
|
"mean_token_accuracy": 0.10643761828541756,
|
|
"num_tokens": 914547.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"entropy": 6.915640449523925,
|
|
"epoch": 0.04200798151648813,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0002495,
|
|
"loss": 6.6263,
|
|
"mean_token_accuracy": 0.10527556240558625,
|
|
"num_tokens": 922900.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"entropy": 6.947235059738159,
|
|
"epoch": 0.042428061331653014,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000252,
|
|
"loss": 6.6686,
|
|
"mean_token_accuracy": 0.10355583727359771,
|
|
"num_tokens": 930876.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"entropy": 6.88210015296936,
|
|
"epoch": 0.0428481411468179,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0002545,
|
|
"loss": 6.7087,
|
|
"mean_token_accuracy": 0.10312066823244095,
|
|
"num_tokens": 939871.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"entropy": 6.947447443008423,
|
|
"epoch": 0.043268220961982774,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000257,
|
|
"loss": 6.695,
|
|
"mean_token_accuracy": 0.10180827602744102,
|
|
"num_tokens": 948673.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"entropy": 6.816449880599976,
|
|
"epoch": 0.04368830077714766,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0002595,
|
|
"loss": 6.6488,
|
|
"mean_token_accuracy": 0.09842450320720672,
|
|
"num_tokens": 957603.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"entropy": 6.928069686889648,
|
|
"epoch": 0.04410838059231254,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000262,
|
|
"loss": 6.7274,
|
|
"mean_token_accuracy": 0.09575201719999313,
|
|
"num_tokens": 967731.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"entropy": 6.940513849258423,
|
|
"epoch": 0.04452846040747742,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00026450000000000003,
|
|
"loss": 6.7098,
|
|
"mean_token_accuracy": 0.10156730636954307,
|
|
"num_tokens": 977427.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"entropy": 6.883533573150634,
|
|
"epoch": 0.0449485402226423,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.00026700000000000004,
|
|
"loss": 6.7126,
|
|
"mean_token_accuracy": 0.09694371595978737,
|
|
"num_tokens": 986758.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"entropy": 7.054216384887695,
|
|
"epoch": 0.045368620037807186,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.00026950000000000005,
|
|
"loss": 6.7073,
|
|
"mean_token_accuracy": 0.10619494765996933,
|
|
"num_tokens": 996377.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"entropy": 6.823930788040161,
|
|
"epoch": 0.04578869985297206,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00027200000000000005,
|
|
"loss": 6.7762,
|
|
"mean_token_accuracy": 0.09864854142069816,
|
|
"num_tokens": 1006483.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"entropy": 6.839679384231568,
|
|
"epoch": 0.04620877966813695,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0002745,
|
|
"loss": 6.6608,
|
|
"mean_token_accuracy": 0.09898171871900559,
|
|
"num_tokens": 1016132.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"entropy": 6.886328125,
|
|
"epoch": 0.04662885948330183,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000277,
|
|
"loss": 6.5878,
|
|
"mean_token_accuracy": 0.10534627884626388,
|
|
"num_tokens": 1024970.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"entropy": 6.892021656036377,
|
|
"epoch": 0.04704893929846671,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0002795,
|
|
"loss": 6.7043,
|
|
"mean_token_accuracy": 0.0967423141002655,
|
|
"num_tokens": 1034335.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"entropy": 6.914703607559204,
|
|
"epoch": 0.04746901911363159,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.00028199999999999997,
|
|
"loss": 6.756,
|
|
"mean_token_accuracy": 0.10775318518280982,
|
|
"num_tokens": 1043954.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"entropy": 6.942829847335815,
|
|
"epoch": 0.04788909892879647,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0002845,
|
|
"loss": 6.6882,
|
|
"mean_token_accuracy": 0.10058957412838936,
|
|
"num_tokens": 1053554.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"entropy": 6.854119396209716,
|
|
"epoch": 0.04830917874396135,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.000287,
|
|
"loss": 6.6366,
|
|
"mean_token_accuracy": 0.10385002046823502,
|
|
"num_tokens": 1062008.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"entropy": 6.868479824066162,
|
|
"epoch": 0.048729258559126236,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0002895,
|
|
"loss": 6.7048,
|
|
"mean_token_accuracy": 0.10346106439828873,
|
|
"num_tokens": 1070740.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"entropy": 6.8440343856811525,
|
|
"epoch": 0.04914933837429111,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000292,
|
|
"loss": 6.7057,
|
|
"mean_token_accuracy": 0.10240900367498398,
|
|
"num_tokens": 1079681.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"entropy": 6.858892154693604,
|
|
"epoch": 0.049569418189456,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0002945,
|
|
"loss": 6.5847,
|
|
"mean_token_accuracy": 0.10450911447405815,
|
|
"num_tokens": 1088979.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"entropy": 6.772767686843872,
|
|
"epoch": 0.04998949800462088,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000297,
|
|
"loss": 6.5832,
|
|
"mean_token_accuracy": 0.10501813441514969,
|
|
"num_tokens": 1097870.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"entropy": 6.856569433212281,
|
|
"epoch": 0.05040957781978576,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0002995,
|
|
"loss": 6.714,
|
|
"mean_token_accuracy": 0.09948427230119705,
|
|
"num_tokens": 1107948.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"entropy": 6.876928329467773,
|
|
"epoch": 0.05082965763495064,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000302,
|
|
"loss": 6.6226,
|
|
"mean_token_accuracy": 0.1076712541282177,
|
|
"num_tokens": 1117032.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"entropy": 6.769250106811524,
|
|
"epoch": 0.051249737450115525,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0003045,
|
|
"loss": 6.5928,
|
|
"mean_token_accuracy": 0.10671919211745262,
|
|
"num_tokens": 1127834.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"entropy": 6.948690032958984,
|
|
"epoch": 0.0516698172652804,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000307,
|
|
"loss": 6.695,
|
|
"mean_token_accuracy": 0.11499854996800422,
|
|
"num_tokens": 1137382.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"entropy": 6.77532000541687,
|
|
"epoch": 0.052089897080445285,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0003095,
|
|
"loss": 6.5422,
|
|
"mean_token_accuracy": 0.11223937124013901,
|
|
"num_tokens": 1146095.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"entropy": 6.723394155502319,
|
|
"epoch": 0.05250997689561017,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000312,
|
|
"loss": 6.5822,
|
|
"mean_token_accuracy": 0.10564726367592811,
|
|
"num_tokens": 1154981.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"entropy": 6.776411151885986,
|
|
"epoch": 0.052930056710775046,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0003145,
|
|
"loss": 6.6153,
|
|
"mean_token_accuracy": 0.10753953084349632,
|
|
"num_tokens": 1164939.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"entropy": 6.936794233322144,
|
|
"epoch": 0.05335013652593993,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000317,
|
|
"loss": 6.7291,
|
|
"mean_token_accuracy": 0.09790047407150268,
|
|
"num_tokens": 1174991.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"entropy": 6.818718576431275,
|
|
"epoch": 0.05377021634110481,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0003195,
|
|
"loss": 6.78,
|
|
"mean_token_accuracy": 0.09581352695822716,
|
|
"num_tokens": 1184885.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"entropy": 6.859689378738404,
|
|
"epoch": 0.05419029615626969,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.000322,
|
|
"loss": 6.6652,
|
|
"mean_token_accuracy": 0.10177846625447273,
|
|
"num_tokens": 1193637.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"entropy": 6.70958137512207,
|
|
"epoch": 0.054610375971434574,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00032450000000000003,
|
|
"loss": 6.4505,
|
|
"mean_token_accuracy": 0.11398516818881035,
|
|
"num_tokens": 1202188.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"entropy": 6.731061363220215,
|
|
"epoch": 0.05503045578659945,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00032700000000000003,
|
|
"loss": 6.5923,
|
|
"mean_token_accuracy": 0.10111142173409463,
|
|
"num_tokens": 1210768.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"entropy": 6.75755124092102,
|
|
"epoch": 0.055450535601764335,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00032950000000000004,
|
|
"loss": 6.5885,
|
|
"mean_token_accuracy": 0.10299575850367546,
|
|
"num_tokens": 1219819.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"entropy": 6.8775472164154055,
|
|
"epoch": 0.05587061541692922,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.00033200000000000005,
|
|
"loss": 6.6507,
|
|
"mean_token_accuracy": 0.09766614213585853,
|
|
"num_tokens": 1229703.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"entropy": 6.829215049743652,
|
|
"epoch": 0.056290695232094096,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00033450000000000005,
|
|
"loss": 6.6863,
|
|
"mean_token_accuracy": 0.09930930510163308,
|
|
"num_tokens": 1238942.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"entropy": 6.886805677413941,
|
|
"epoch": 0.05671077504725898,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000337,
|
|
"loss": 6.7475,
|
|
"mean_token_accuracy": 0.09512239620089531,
|
|
"num_tokens": 1248943.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"entropy": 6.774325275421143,
|
|
"epoch": 0.05713085486242386,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0003395,
|
|
"loss": 6.6164,
|
|
"mean_token_accuracy": 0.10321223735809326,
|
|
"num_tokens": 1257761.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"entropy": 6.6621216297149655,
|
|
"epoch": 0.05755093467758874,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000342,
|
|
"loss": 6.5622,
|
|
"mean_token_accuracy": 0.10228212624788284,
|
|
"num_tokens": 1267216.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"entropy": 6.826507520675659,
|
|
"epoch": 0.057971014492753624,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.00034449999999999997,
|
|
"loss": 6.6587,
|
|
"mean_token_accuracy": 0.1079720102250576,
|
|
"num_tokens": 1277210.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"entropy": 6.737741279602051,
|
|
"epoch": 0.05839109430791851,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000347,
|
|
"loss": 6.5588,
|
|
"mean_token_accuracy": 0.10001136437058449,
|
|
"num_tokens": 1285310.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"entropy": 6.800521755218506,
|
|
"epoch": 0.058811174123083385,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003495,
|
|
"loss": 6.5887,
|
|
"mean_token_accuracy": 0.10580191239714623,
|
|
"num_tokens": 1294421.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"entropy": 6.603023052215576,
|
|
"epoch": 0.05923125393824827,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000352,
|
|
"loss": 6.4007,
|
|
"mean_token_accuracy": 0.11451570391654968,
|
|
"num_tokens": 1303281.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"entropy": 6.694077110290527,
|
|
"epoch": 0.059651333753413145,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003545,
|
|
"loss": 6.5884,
|
|
"mean_token_accuracy": 0.1103569135069847,
|
|
"num_tokens": 1312280.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"entropy": 6.703026485443115,
|
|
"epoch": 0.06007141356857803,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000357,
|
|
"loss": 6.5455,
|
|
"mean_token_accuracy": 0.10655389800667762,
|
|
"num_tokens": 1321243.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"entropy": 6.783720779418945,
|
|
"epoch": 0.06049149338374291,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0003595,
|
|
"loss": 6.6752,
|
|
"mean_token_accuracy": 0.10890973284840584,
|
|
"num_tokens": 1330324.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"entropy": 6.716011047363281,
|
|
"epoch": 0.06091157319890779,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000362,
|
|
"loss": 6.4895,
|
|
"mean_token_accuracy": 0.11130202338099479,
|
|
"num_tokens": 1339485.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"entropy": 6.763300609588623,
|
|
"epoch": 0.06133165301407267,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0003645,
|
|
"loss": 6.6603,
|
|
"mean_token_accuracy": 0.09835303947329521,
|
|
"num_tokens": 1348640.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"entropy": 6.686880588531494,
|
|
"epoch": 0.06175173282923756,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000367,
|
|
"loss": 6.5352,
|
|
"mean_token_accuracy": 0.10910931676626205,
|
|
"num_tokens": 1357581.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"entropy": 6.789766788482666,
|
|
"epoch": 0.062171812644402434,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0003695,
|
|
"loss": 6.6092,
|
|
"mean_token_accuracy": 0.10686837136745453,
|
|
"num_tokens": 1367883.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"entropy": 6.689715576171875,
|
|
"epoch": 0.06259189245956731,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000372,
|
|
"loss": 6.5732,
|
|
"mean_token_accuracy": 0.10044000372290611,
|
|
"num_tokens": 1376936.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"entropy": 6.619902896881103,
|
|
"epoch": 0.0630119722747322,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0003745,
|
|
"loss": 6.5013,
|
|
"mean_token_accuracy": 0.10585071742534638,
|
|
"num_tokens": 1386359.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"entropy": 6.691353893280029,
|
|
"epoch": 0.06343205208989708,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000377,
|
|
"loss": 6.5614,
|
|
"mean_token_accuracy": 0.10925468727946282,
|
|
"num_tokens": 1395223.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"entropy": 6.756776332855225,
|
|
"epoch": 0.06385213190506196,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0003795,
|
|
"loss": 6.7048,
|
|
"mean_token_accuracy": 0.10100763738155365,
|
|
"num_tokens": 1404917.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"entropy": 6.7892101287841795,
|
|
"epoch": 0.06427221172022685,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000382,
|
|
"loss": 6.6077,
|
|
"mean_token_accuracy": 0.11203819289803504,
|
|
"num_tokens": 1413348.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"entropy": 6.617217540740967,
|
|
"epoch": 0.06469229153539173,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0003845,
|
|
"loss": 6.5804,
|
|
"mean_token_accuracy": 0.10595368966460228,
|
|
"num_tokens": 1421726.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"entropy": 6.699965381622315,
|
|
"epoch": 0.0651123713505566,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00038700000000000003,
|
|
"loss": 6.5984,
|
|
"mean_token_accuracy": 0.10766990706324578,
|
|
"num_tokens": 1430686.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"entropy": 6.773920488357544,
|
|
"epoch": 0.06553245116572148,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.00038950000000000003,
|
|
"loss": 6.5765,
|
|
"mean_token_accuracy": 0.10770290642976761,
|
|
"num_tokens": 1439499.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"entropy": 6.685867691040039,
|
|
"epoch": 0.06595253098088637,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.00039200000000000004,
|
|
"loss": 6.5731,
|
|
"mean_token_accuracy": 0.10584950372576714,
|
|
"num_tokens": 1448220.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"entropy": 6.635032224655151,
|
|
"epoch": 0.06637261079605125,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.00039450000000000005,
|
|
"loss": 6.5914,
|
|
"mean_token_accuracy": 0.09675629287958146,
|
|
"num_tokens": 1458217.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"entropy": 6.699159860610962,
|
|
"epoch": 0.06679269061121614,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00039700000000000005,
|
|
"loss": 6.4848,
|
|
"mean_token_accuracy": 0.10567129477858543,
|
|
"num_tokens": 1467422.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"entropy": 6.620410299301147,
|
|
"epoch": 0.06721277042638102,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0003995,
|
|
"loss": 6.4767,
|
|
"mean_token_accuracy": 0.11094664260745049,
|
|
"num_tokens": 1476152.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"entropy": 6.650699758529663,
|
|
"epoch": 0.06763285024154589,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.000402,
|
|
"loss": 6.5738,
|
|
"mean_token_accuracy": 0.1043787069618702,
|
|
"num_tokens": 1485248.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"entropy": 6.619400262832642,
|
|
"epoch": 0.06805293005671077,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004045,
|
|
"loss": 6.5511,
|
|
"mean_token_accuracy": 0.10442669913172722,
|
|
"num_tokens": 1494248.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"entropy": 6.722617673873901,
|
|
"epoch": 0.06847300987187566,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.00040699999999999997,
|
|
"loss": 6.6573,
|
|
"mean_token_accuracy": 0.10585515722632408,
|
|
"num_tokens": 1503565.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"entropy": 6.83908371925354,
|
|
"epoch": 0.06889308968704054,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004095,
|
|
"loss": 6.745,
|
|
"mean_token_accuracy": 0.10003346055746079,
|
|
"num_tokens": 1513227.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"entropy": 6.658945035934448,
|
|
"epoch": 0.06931316950220542,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000412,
|
|
"loss": 6.5346,
|
|
"mean_token_accuracy": 0.10508675500750542,
|
|
"num_tokens": 1522312.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"entropy": 6.637969160079956,
|
|
"epoch": 0.0697332493173703,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004145,
|
|
"loss": 6.4802,
|
|
"mean_token_accuracy": 0.10670675709843636,
|
|
"num_tokens": 1531720.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"entropy": 6.6340169429779055,
|
|
"epoch": 0.07015332913253518,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000417,
|
|
"loss": 6.5721,
|
|
"mean_token_accuracy": 0.10074454993009567,
|
|
"num_tokens": 1541238.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"entropy": 6.695564794540405,
|
|
"epoch": 0.07057340894770006,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004195,
|
|
"loss": 6.6648,
|
|
"mean_token_accuracy": 0.10375690832734108,
|
|
"num_tokens": 1550875.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"entropy": 6.645870971679687,
|
|
"epoch": 0.07099348876286495,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000422,
|
|
"loss": 6.6076,
|
|
"mean_token_accuracy": 0.10648187175393105,
|
|
"num_tokens": 1560287.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"entropy": 6.6967510223388675,
|
|
"epoch": 0.07141356857802983,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004245,
|
|
"loss": 6.4978,
|
|
"mean_token_accuracy": 0.11105224043130875,
|
|
"num_tokens": 1569043.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"entropy": 6.554346418380737,
|
|
"epoch": 0.07183364839319471,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000427,
|
|
"loss": 6.5154,
|
|
"mean_token_accuracy": 0.11203170269727707,
|
|
"num_tokens": 1578112.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"entropy": 6.515066003799438,
|
|
"epoch": 0.07225372820835958,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004295,
|
|
"loss": 6.4364,
|
|
"mean_token_accuracy": 0.11132391095161438,
|
|
"num_tokens": 1586587.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"entropy": 6.742719125747681,
|
|
"epoch": 0.07267380802352447,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.000432,
|
|
"loss": 6.5978,
|
|
"mean_token_accuracy": 0.10682642236351966,
|
|
"num_tokens": 1595585.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"entropy": 6.641049814224243,
|
|
"epoch": 0.07309388783868935,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004345,
|
|
"loss": 6.551,
|
|
"mean_token_accuracy": 0.10661023184657097,
|
|
"num_tokens": 1605355.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"entropy": 6.638308906555176,
|
|
"epoch": 0.07351396765385423,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000437,
|
|
"loss": 6.5889,
|
|
"mean_token_accuracy": 0.10184741988778115,
|
|
"num_tokens": 1613637.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"entropy": 6.658770608901977,
|
|
"epoch": 0.07393404746901912,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004395,
|
|
"loss": 6.5351,
|
|
"mean_token_accuracy": 0.10806669145822526,
|
|
"num_tokens": 1622731.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"entropy": 6.610404300689697,
|
|
"epoch": 0.074354127284184,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000442,
|
|
"loss": 6.5083,
|
|
"mean_token_accuracy": 0.10660439133644103,
|
|
"num_tokens": 1632098.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"entropy": 6.581112480163574,
|
|
"epoch": 0.07477420709934887,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004445,
|
|
"loss": 6.5117,
|
|
"mean_token_accuracy": 0.10190015733242035,
|
|
"num_tokens": 1641259.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"entropy": 6.669602966308593,
|
|
"epoch": 0.07519428691451376,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000447,
|
|
"loss": 6.5769,
|
|
"mean_token_accuracy": 0.103199552744627,
|
|
"num_tokens": 1651362.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"entropy": 6.581767272949219,
|
|
"epoch": 0.07561436672967864,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00044950000000000003,
|
|
"loss": 6.4893,
|
|
"mean_token_accuracy": 0.1063641555607319,
|
|
"num_tokens": 1660190.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"entropy": 6.573458099365235,
|
|
"epoch": 0.07603444654484352,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.00045200000000000004,
|
|
"loss": 6.5473,
|
|
"mean_token_accuracy": 0.10238413438200951,
|
|
"num_tokens": 1669020.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"entropy": 6.650429916381836,
|
|
"epoch": 0.0764545263600084,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.00045450000000000004,
|
|
"loss": 6.5657,
|
|
"mean_token_accuracy": 0.1067358523607254,
|
|
"num_tokens": 1678158.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"entropy": 6.640725135803223,
|
|
"epoch": 0.07687460617517328,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.00045700000000000005,
|
|
"loss": 6.5723,
|
|
"mean_token_accuracy": 0.10445328801870346,
|
|
"num_tokens": 1687481.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"entropy": 6.5850495338439945,
|
|
"epoch": 0.07729468599033816,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00045950000000000006,
|
|
"loss": 6.5327,
|
|
"mean_token_accuracy": 0.11253217458724976,
|
|
"num_tokens": 1696782.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"entropy": 6.555831384658814,
|
|
"epoch": 0.07771476580550304,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000462,
|
|
"loss": 6.528,
|
|
"mean_token_accuracy": 0.10824255496263505,
|
|
"num_tokens": 1706153.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"entropy": 6.625135850906372,
|
|
"epoch": 0.07813484562066793,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004645,
|
|
"loss": 6.559,
|
|
"mean_token_accuracy": 0.1069357268512249,
|
|
"num_tokens": 1715585.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"entropy": 6.768569469451904,
|
|
"epoch": 0.07855492543583281,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000467,
|
|
"loss": 6.6818,
|
|
"mean_token_accuracy": 0.10121209248900413,
|
|
"num_tokens": 1724857.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"entropy": 6.553330516815185,
|
|
"epoch": 0.0789750052509977,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004695,
|
|
"loss": 6.4953,
|
|
"mean_token_accuracy": 0.11398640796542167,
|
|
"num_tokens": 1733528.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"entropy": 6.64405460357666,
|
|
"epoch": 0.07939508506616257,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000472,
|
|
"loss": 6.616,
|
|
"mean_token_accuracy": 0.10737027525901795,
|
|
"num_tokens": 1742953.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"entropy": 6.612694597244262,
|
|
"epoch": 0.07981516488132745,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004745,
|
|
"loss": 6.5605,
|
|
"mean_token_accuracy": 0.11337294653058053,
|
|
"num_tokens": 1752155.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"entropy": 6.579654312133789,
|
|
"epoch": 0.08023524469649233,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000477,
|
|
"loss": 6.4539,
|
|
"mean_token_accuracy": 0.10857293009757996,
|
|
"num_tokens": 1760562.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"entropy": 6.568000841140747,
|
|
"epoch": 0.08065532451165722,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004795,
|
|
"loss": 6.4953,
|
|
"mean_token_accuracy": 0.10662117302417755,
|
|
"num_tokens": 1769631.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"entropy": 6.530600309371948,
|
|
"epoch": 0.0810754043268221,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000482,
|
|
"loss": 6.513,
|
|
"mean_token_accuracy": 0.10268357619643212,
|
|
"num_tokens": 1779080.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"entropy": 6.608699893951416,
|
|
"epoch": 0.08149548414198697,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004845,
|
|
"loss": 6.5062,
|
|
"mean_token_accuracy": 0.1082501009106636,
|
|
"num_tokens": 1787830.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"entropy": 6.3936583518981935,
|
|
"epoch": 0.08191556395715185,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000487,
|
|
"loss": 6.4399,
|
|
"mean_token_accuracy": 0.10422300174832344,
|
|
"num_tokens": 1796998.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"entropy": 6.6043681621551515,
|
|
"epoch": 0.08233564377231674,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004895,
|
|
"loss": 6.5165,
|
|
"mean_token_accuracy": 0.1103883646428585,
|
|
"num_tokens": 1806194.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"entropy": 6.372901821136475,
|
|
"epoch": 0.08275572358748162,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000492,
|
|
"loss": 6.4139,
|
|
"mean_token_accuracy": 0.11206447035074234,
|
|
"num_tokens": 1815751.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"entropy": 6.442894506454468,
|
|
"epoch": 0.0831758034026465,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004945,
|
|
"loss": 6.4907,
|
|
"mean_token_accuracy": 0.11046408414840699,
|
|
"num_tokens": 1825379.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"entropy": 6.55560712814331,
|
|
"epoch": 0.08359588321781139,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000497,
|
|
"loss": 6.454,
|
|
"mean_token_accuracy": 0.10851948186755181,
|
|
"num_tokens": 1834158.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"entropy": 6.516812181472778,
|
|
"epoch": 0.08401596303297626,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004995,
|
|
"loss": 6.4132,
|
|
"mean_token_accuracy": 0.10896480083465576,
|
|
"num_tokens": 1842724.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"entropy": 6.518254280090332,
|
|
"epoch": 0.08443604284814114,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499999998724557,
|
|
"loss": 6.4359,
|
|
"mean_token_accuracy": 0.11062911972403526,
|
|
"num_tokens": 1852485.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"entropy": 6.472753667831421,
|
|
"epoch": 0.08485612266330603,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999999935430703,
|
|
"loss": 6.4668,
|
|
"mean_token_accuracy": 0.11211320757865906,
|
|
"num_tokens": 1861303.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"entropy": 6.340228652954101,
|
|
"epoch": 0.08527620247847091,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999999843758243,
|
|
"loss": 6.4328,
|
|
"mean_token_accuracy": 0.11544003784656524,
|
|
"num_tokens": 1870859.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"entropy": 6.671287488937378,
|
|
"epoch": 0.0856962822936358,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999999712228196,
|
|
"loss": 6.6205,
|
|
"mean_token_accuracy": 0.0996169812977314,
|
|
"num_tokens": 1880295.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"entropy": 6.63971586227417,
|
|
"epoch": 0.08611636210880068,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999999540840562,
|
|
"loss": 6.5111,
|
|
"mean_token_accuracy": 0.11260380744934081,
|
|
"num_tokens": 1889193.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"entropy": 6.493311834335327,
|
|
"epoch": 0.08653644192396555,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999999329595345,
|
|
"loss": 6.6021,
|
|
"mean_token_accuracy": 0.09916243478655815,
|
|
"num_tokens": 1899437.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"entropy": 6.5729930877685545,
|
|
"epoch": 0.08695652173913043,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999999078492548,
|
|
"loss": 6.5058,
|
|
"mean_token_accuracy": 0.10446131974458694,
|
|
"num_tokens": 1907882.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"entropy": 6.492805910110474,
|
|
"epoch": 0.08737660155429532,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999998787532176,
|
|
"loss": 6.4138,
|
|
"mean_token_accuracy": 0.1126400038599968,
|
|
"num_tokens": 1916872.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"entropy": 6.506380701065064,
|
|
"epoch": 0.0877966813694602,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999998456714234,
|
|
"loss": 6.5924,
|
|
"mean_token_accuracy": 0.11272363439202308,
|
|
"num_tokens": 1926636.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"entropy": 6.5269097805023195,
|
|
"epoch": 0.08821676118462508,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999998086038729,
|
|
"loss": 6.4905,
|
|
"mean_token_accuracy": 0.10973675772547722,
|
|
"num_tokens": 1935962.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"entropy": 6.491125774383545,
|
|
"epoch": 0.08863684099978995,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999997675505665,
|
|
"loss": 6.4506,
|
|
"mean_token_accuracy": 0.11091897338628769,
|
|
"num_tokens": 1944600.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"entropy": 6.560743236541748,
|
|
"epoch": 0.08905692081495484,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999997225115052,
|
|
"loss": 6.6403,
|
|
"mean_token_accuracy": 0.10566280484199524,
|
|
"num_tokens": 1954234.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"entropy": 6.71457724571228,
|
|
"epoch": 0.08947700063011972,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999996734866896,
|
|
"loss": 6.5989,
|
|
"mean_token_accuracy": 0.10413395762443542,
|
|
"num_tokens": 1964499.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"entropy": 6.307662582397461,
|
|
"epoch": 0.0898970804452846,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999996204761206,
|
|
"loss": 6.3004,
|
|
"mean_token_accuracy": 0.11687865033745766,
|
|
"num_tokens": 1973635.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"entropy": 6.497924661636352,
|
|
"epoch": 0.09031716026044949,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999995634797993,
|
|
"loss": 6.4546,
|
|
"mean_token_accuracy": 0.11474235132336616,
|
|
"num_tokens": 1983509.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"entropy": 6.480887317657471,
|
|
"epoch": 0.09073724007561437,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999995024977265,
|
|
"loss": 6.4317,
|
|
"mean_token_accuracy": 0.11495376154780387,
|
|
"num_tokens": 1992336.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"entropy": 6.518788290023804,
|
|
"epoch": 0.09115731989077924,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999994375299034,
|
|
"loss": 6.467,
|
|
"mean_token_accuracy": 0.11141329482197762,
|
|
"num_tokens": 2001931.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"entropy": 6.396731853485107,
|
|
"epoch": 0.09157739970594413,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499999368576331,
|
|
"loss": 6.3268,
|
|
"mean_token_accuracy": 0.11944503113627433,
|
|
"num_tokens": 2010935.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"entropy": 6.410158395767212,
|
|
"epoch": 0.09199747952110901,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999992956370109,
|
|
"loss": 6.3911,
|
|
"mean_token_accuracy": 0.1134261205792427,
|
|
"num_tokens": 2020587.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"entropy": 6.34518141746521,
|
|
"epoch": 0.0924175593362739,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499999218711944,
|
|
"loss": 6.4155,
|
|
"mean_token_accuracy": 0.11149835661053657,
|
|
"num_tokens": 2029743.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"entropy": 6.548259019851685,
|
|
"epoch": 0.09283763915143878,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999991378011317,
|
|
"loss": 6.4423,
|
|
"mean_token_accuracy": 0.11465151533484459,
|
|
"num_tokens": 2038468.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"entropy": 6.381483364105224,
|
|
"epoch": 0.09325771896660366,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999990529045757,
|
|
"loss": 6.3588,
|
|
"mean_token_accuracy": 0.11419494673609734,
|
|
"num_tokens": 2047456.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"entropy": 6.610185289382935,
|
|
"epoch": 0.09367779878176853,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999989640222771,
|
|
"loss": 6.6665,
|
|
"mean_token_accuracy": 0.09994395300745965,
|
|
"num_tokens": 2056691.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"entropy": 6.5219189643859865,
|
|
"epoch": 0.09409787859693342,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499998871154238,
|
|
"loss": 6.4644,
|
|
"mean_token_accuracy": 0.10945621505379677,
|
|
"num_tokens": 2066068.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"entropy": 6.522899913787842,
|
|
"epoch": 0.0945179584120983,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999987743004597,
|
|
"loss": 6.3814,
|
|
"mean_token_accuracy": 0.11605900377035142,
|
|
"num_tokens": 2075113.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"entropy": 6.463716268539429,
|
|
"epoch": 0.09493803822726318,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999986734609438,
|
|
"loss": 6.5424,
|
|
"mean_token_accuracy": 0.11121912077069282,
|
|
"num_tokens": 2084557.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"entropy": 6.5195088386535645,
|
|
"epoch": 0.09535811804242807,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999985686356923,
|
|
"loss": 6.4293,
|
|
"mean_token_accuracy": 0.10896992832422256,
|
|
"num_tokens": 2093424.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"entropy": 6.501124429702759,
|
|
"epoch": 0.09577819785759294,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499998459824707,
|
|
"loss": 6.541,
|
|
"mean_token_accuracy": 0.10720071643590927,
|
|
"num_tokens": 2103066.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"entropy": 6.485859060287476,
|
|
"epoch": 0.09619827767275782,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.00049999834702799,
|
|
"loss": 6.4218,
|
|
"mean_token_accuracy": 0.11379043385386467,
|
|
"num_tokens": 2112447.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"entropy": 6.421380949020386,
|
|
"epoch": 0.0966183574879227,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999982302455431,
|
|
"loss": 6.4471,
|
|
"mean_token_accuracy": 0.11498942598700523,
|
|
"num_tokens": 2121949.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"entropy": 6.473872327804566,
|
|
"epoch": 0.09703843730308759,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999981094773683,
|
|
"loss": 6.3538,
|
|
"mean_token_accuracy": 0.11318295449018478,
|
|
"num_tokens": 2130464.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"entropy": 6.440775918960571,
|
|
"epoch": 0.09745851711825247,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.000499997984723468,
|
|
"loss": 6.516,
|
|
"mean_token_accuracy": 0.10744207352399826,
|
|
"num_tokens": 2139577.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"entropy": 6.234747648239136,
|
|
"epoch": 0.09787859693341736,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999978559838441,
|
|
"loss": 6.2367,
|
|
"mean_token_accuracy": 0.11980548799037934,
|
|
"num_tokens": 2147919.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"entropy": 6.425514888763428,
|
|
"epoch": 0.09829867674858223,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999977232584991,
|
|
"loss": 6.411,
|
|
"mean_token_accuracy": 0.11489588171243667,
|
|
"num_tokens": 2156936.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"entropy": 6.470327949523925,
|
|
"epoch": 0.09871875656374711,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999975865474354,
|
|
"loss": 6.474,
|
|
"mean_token_accuracy": 0.11181759610772132,
|
|
"num_tokens": 2165362.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"entropy": 6.40549669265747,
|
|
"epoch": 0.099138836378912,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999974458506551,
|
|
"loss": 6.3913,
|
|
"mean_token_accuracy": 0.11321083605289459,
|
|
"num_tokens": 2173665.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"entropy": 6.484990215301513,
|
|
"epoch": 0.09955891619407688,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000499997301168161,
|
|
"loss": 6.3547,
|
|
"mean_token_accuracy": 0.1165615513920784,
|
|
"num_tokens": 2182222.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"entropy": 6.47981333732605,
|
|
"epoch": 0.09997899600924176,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999971524999556,
|
|
"loss": 6.4586,
|
|
"mean_token_accuracy": 0.11381722316145897,
|
|
"num_tokens": 2192358.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"entropy": 6.44229564666748,
|
|
"epoch": 0.10039907582440663,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999969998460414,
|
|
"loss": 6.4519,
|
|
"mean_token_accuracy": 0.11379328668117523,
|
|
"num_tokens": 2201889.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"entropy": 6.39982123374939,
|
|
"epoch": 0.10081915563957151,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004999968432064213,
|
|
"loss": 6.4571,
|
|
"mean_token_accuracy": 0.11692695915699006,
|
|
"num_tokens": 2211810.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"entropy": 6.344041538238526,
|
|
"epoch": 0.1012392354547364,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999966825810979,
|
|
"loss": 6.401,
|
|
"mean_token_accuracy": 0.11303748413920403,
|
|
"num_tokens": 2221123.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"entropy": 6.35064206123352,
|
|
"epoch": 0.10165931526990128,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999965179700742,
|
|
"loss": 6.3287,
|
|
"mean_token_accuracy": 0.11802673861384391,
|
|
"num_tokens": 2230129.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"entropy": 6.3616985321044925,
|
|
"epoch": 0.10207939508506617,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499996349373353,
|
|
"loss": 6.3828,
|
|
"mean_token_accuracy": 0.11236807405948639,
|
|
"num_tokens": 2239929.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"entropy": 6.4787780284881595,
|
|
"epoch": 0.10249947490023105,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999961767909374,
|
|
"loss": 6.3565,
|
|
"mean_token_accuracy": 0.11246357783675194,
|
|
"num_tokens": 2248078.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"entropy": 6.329130172729492,
|
|
"epoch": 0.10291955471539592,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999960002228303,
|
|
"loss": 6.4464,
|
|
"mean_token_accuracy": 0.11457708552479744,
|
|
"num_tokens": 2256975.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"entropy": 6.41359543800354,
|
|
"epoch": 0.1033396345305608,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999958196690349,
|
|
"loss": 6.3075,
|
|
"mean_token_accuracy": 0.11755945533514023,
|
|
"num_tokens": 2265797.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"entropy": 6.318053436279297,
|
|
"epoch": 0.10375971434572569,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999956351295545,
|
|
"loss": 6.407,
|
|
"mean_token_accuracy": 0.12029065862298012,
|
|
"num_tokens": 2274099.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"entropy": 6.369567108154297,
|
|
"epoch": 0.10417979416089057,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999954466043922,
|
|
"loss": 6.3363,
|
|
"mean_token_accuracy": 0.11501603052020073,
|
|
"num_tokens": 2282360.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"entropy": 6.357530832290649,
|
|
"epoch": 0.10459987397605545,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999952540935514,
|
|
"loss": 6.427,
|
|
"mean_token_accuracy": 0.10687654167413711,
|
|
"num_tokens": 2292714.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"entropy": 6.4097044467926025,
|
|
"epoch": 0.10501995379122034,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999950575970356,
|
|
"loss": 6.3634,
|
|
"mean_token_accuracy": 0.11621319502592087,
|
|
"num_tokens": 2301633.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"entropy": 6.369806289672852,
|
|
"epoch": 0.10544003360638521,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999948571148482,
|
|
"loss": 6.3449,
|
|
"mean_token_accuracy": 0.12026969790458679,
|
|
"num_tokens": 2310067.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"entropy": 6.404969978332519,
|
|
"epoch": 0.10586011342155009,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999946526469927,
|
|
"loss": 6.4302,
|
|
"mean_token_accuracy": 0.11556015834212303,
|
|
"num_tokens": 2320090.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"entropy": 6.372836637496948,
|
|
"epoch": 0.10628019323671498,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999944441934728,
|
|
"loss": 6.3703,
|
|
"mean_token_accuracy": 0.1213706873357296,
|
|
"num_tokens": 2329255.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"entropy": 6.461856746673584,
|
|
"epoch": 0.10670027305187986,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999942317542922,
|
|
"loss": 6.4648,
|
|
"mean_token_accuracy": 0.11396320164203644,
|
|
"num_tokens": 2339535.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"entropy": 6.327042865753174,
|
|
"epoch": 0.10712035286704474,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999940153294546,
|
|
"loss": 6.3664,
|
|
"mean_token_accuracy": 0.11329737976193428,
|
|
"num_tokens": 2348948.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"entropy": 6.429010534286499,
|
|
"epoch": 0.10754043268220961,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.000499993794918964,
|
|
"loss": 6.3999,
|
|
"mean_token_accuracy": 0.11255929544568062,
|
|
"num_tokens": 2359141.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"entropy": 6.310593366622925,
|
|
"epoch": 0.1079605124973745,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999935705228241,
|
|
"loss": 6.4541,
|
|
"mean_token_accuracy": 0.11158784702420235,
|
|
"num_tokens": 2368906.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"entropy": 6.527987766265869,
|
|
"epoch": 0.10838059231253938,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999933421410389,
|
|
"loss": 6.4033,
|
|
"mean_token_accuracy": 0.11808300763368607,
|
|
"num_tokens": 2377029.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"entropy": 6.42291259765625,
|
|
"epoch": 0.10880067212770426,
|
|
"grad_norm": 0.8984375,
|
|
"learning_rate": 0.0004999931097736125,
|
|
"loss": 6.4774,
|
|
"mean_token_accuracy": 0.11096713319420815,
|
|
"num_tokens": 2387088.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"entropy": 6.416878700256348,
|
|
"epoch": 0.10922075194286915,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999928734205492,
|
|
"loss": 6.3793,
|
|
"mean_token_accuracy": 0.11391115635633468,
|
|
"num_tokens": 2395596.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"entropy": 6.3132000923156735,
|
|
"epoch": 0.10964083175803403,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999926330818528,
|
|
"loss": 6.3632,
|
|
"mean_token_accuracy": 0.12021223455667496,
|
|
"num_tokens": 2404506.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"entropy": 6.388805866241455,
|
|
"epoch": 0.1100609115731989,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999923887575278,
|
|
"loss": 6.4068,
|
|
"mean_token_accuracy": 0.11463478580117226,
|
|
"num_tokens": 2414342.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"entropy": 6.404540014266968,
|
|
"epoch": 0.11048099138836379,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999921404475785,
|
|
"loss": 6.3616,
|
|
"mean_token_accuracy": 0.11288373470306397,
|
|
"num_tokens": 2423076.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"entropy": 6.345643043518066,
|
|
"epoch": 0.11090107120352867,
|
|
"grad_norm": 0.96875,
|
|
"learning_rate": 0.0004999918881520093,
|
|
"loss": 6.3217,
|
|
"mean_token_accuracy": 0.12194343283772469,
|
|
"num_tokens": 2432492.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"entropy": 6.29752926826477,
|
|
"epoch": 0.11132115101869355,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999916318708246,
|
|
"loss": 6.2719,
|
|
"mean_token_accuracy": 0.12224209308624268,
|
|
"num_tokens": 2441916.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"entropy": 6.3799408912658695,
|
|
"epoch": 0.11174123083385844,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999913716040291,
|
|
"loss": 6.3145,
|
|
"mean_token_accuracy": 0.12352623343467713,
|
|
"num_tokens": 2450932.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"entropy": 6.248048543930054,
|
|
"epoch": 0.11216131064902331,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999911073516272,
|
|
"loss": 6.354,
|
|
"mean_token_accuracy": 0.11721153929829597,
|
|
"num_tokens": 2460058.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"entropy": 6.305795478820801,
|
|
"epoch": 0.11258139046418819,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999908391136237,
|
|
"loss": 6.3027,
|
|
"mean_token_accuracy": 0.11657693609595299,
|
|
"num_tokens": 2469607.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"entropy": 6.403741073608399,
|
|
"epoch": 0.11300147027935308,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999905668900234,
|
|
"loss": 6.3338,
|
|
"mean_token_accuracy": 0.11997214257717133,
|
|
"num_tokens": 2478345.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"entropy": 6.354722595214843,
|
|
"epoch": 0.11342155009451796,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000499990290680831,
|
|
"loss": 6.2712,
|
|
"mean_token_accuracy": 0.12181988656520844,
|
|
"num_tokens": 2486662.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"entropy": 6.314315986633301,
|
|
"epoch": 0.11384162990968284,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999900104860516,
|
|
"loss": 6.4018,
|
|
"mean_token_accuracy": 0.1190257914364338,
|
|
"num_tokens": 2495392.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"entropy": 6.396038579940796,
|
|
"epoch": 0.11426170972484773,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999897263056898,
|
|
"loss": 6.4374,
|
|
"mean_token_accuracy": 0.11400101035833358,
|
|
"num_tokens": 2505254.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"entropy": 6.415481328964233,
|
|
"epoch": 0.1146817895400126,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499989438139751,
|
|
"loss": 6.242,
|
|
"mean_token_accuracy": 0.12410487607121468,
|
|
"num_tokens": 2514096.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"entropy": 6.27006311416626,
|
|
"epoch": 0.11510186935517748,
|
|
"grad_norm": 0.9140625,
|
|
"learning_rate": 0.0004999891459882401,
|
|
"loss": 6.2694,
|
|
"mean_token_accuracy": 0.11747925281524658,
|
|
"num_tokens": 2523635.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"entropy": 6.264933919906616,
|
|
"epoch": 0.11552194917034236,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999888498511624,
|
|
"loss": 6.3385,
|
|
"mean_token_accuracy": 0.11789564937353134,
|
|
"num_tokens": 2532528.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"entropy": 6.311061573028565,
|
|
"epoch": 0.11594202898550725,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999885497285229,
|
|
"loss": 6.2494,
|
|
"mean_token_accuracy": 0.1151455245912075,
|
|
"num_tokens": 2541893.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"entropy": 6.299208688735962,
|
|
"epoch": 0.11636210880067213,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999882456203273,
|
|
"loss": 6.3013,
|
|
"mean_token_accuracy": 0.12206007465720177,
|
|
"num_tokens": 2551551.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"entropy": 6.300058650970459,
|
|
"epoch": 0.11678218861583702,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004999879375265806,
|
|
"loss": 6.2409,
|
|
"mean_token_accuracy": 0.11834143102169037,
|
|
"num_tokens": 2560183.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"entropy": 6.282702207565308,
|
|
"epoch": 0.11720226843100189,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999876254472886,
|
|
"loss": 6.1445,
|
|
"mean_token_accuracy": 0.12687626332044602,
|
|
"num_tokens": 2568697.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"entropy": 6.272518634796143,
|
|
"epoch": 0.11762234824616677,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999873093824565,
|
|
"loss": 6.3599,
|
|
"mean_token_accuracy": 0.11727140471339226,
|
|
"num_tokens": 2578151.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"entropy": 6.458877515792847,
|
|
"epoch": 0.11804242806133165,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999869893320902,
|
|
"loss": 6.466,
|
|
"mean_token_accuracy": 0.11793015375733376,
|
|
"num_tokens": 2585901.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"entropy": 6.321643972396851,
|
|
"epoch": 0.11846250787649654,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999866652961952,
|
|
"loss": 6.3084,
|
|
"mean_token_accuracy": 0.11437714174389839,
|
|
"num_tokens": 2595655.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"entropy": 6.353274202346801,
|
|
"epoch": 0.11888258769166142,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999863372747773,
|
|
"loss": 6.2695,
|
|
"mean_token_accuracy": 0.11410242393612861,
|
|
"num_tokens": 2604949.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"entropy": 6.335246944427491,
|
|
"epoch": 0.11930266750682629,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004999860052678423,
|
|
"loss": 6.3408,
|
|
"mean_token_accuracy": 0.11986073330044747,
|
|
"num_tokens": 2614260.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"entropy": 6.268476724624634,
|
|
"epoch": 0.11972274732199117,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004999856692753959,
|
|
"loss": 6.3378,
|
|
"mean_token_accuracy": 0.11452461034059525,
|
|
"num_tokens": 2623740.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"entropy": 6.361990165710449,
|
|
"epoch": 0.12014282713715606,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999853292974444,
|
|
"loss": 6.2447,
|
|
"mean_token_accuracy": 0.11876029148697853,
|
|
"num_tokens": 2631998.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"entropy": 6.266280698776245,
|
|
"epoch": 0.12056290695232094,
|
|
"grad_norm": 0.99609375,
|
|
"learning_rate": 0.0004999849853339936,
|
|
"loss": 6.3713,
|
|
"mean_token_accuracy": 0.12363618090748787,
|
|
"num_tokens": 2641169.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"entropy": 6.410887956619263,
|
|
"epoch": 0.12098298676748583,
|
|
"grad_norm": 0.94140625,
|
|
"learning_rate": 0.0004999846373850497,
|
|
"loss": 6.2122,
|
|
"mean_token_accuracy": 0.12586963474750518,
|
|
"num_tokens": 2650576.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"entropy": 6.177606773376465,
|
|
"epoch": 0.12140306658265071,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999842854506186,
|
|
"loss": 6.3172,
|
|
"mean_token_accuracy": 0.11797089278697967,
|
|
"num_tokens": 2660817.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"entropy": 6.282220935821533,
|
|
"epoch": 0.12182314639781558,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999839295307069,
|
|
"loss": 6.2561,
|
|
"mean_token_accuracy": 0.1204748846590519,
|
|
"num_tokens": 2669338.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"entropy": 6.330542469024659,
|
|
"epoch": 0.12224322621298046,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999835696253206,
|
|
"loss": 6.3117,
|
|
"mean_token_accuracy": 0.11790118813514709,
|
|
"num_tokens": 2679108.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"entropy": 6.36133828163147,
|
|
"epoch": 0.12266330602814535,
|
|
"grad_norm": 0.97265625,
|
|
"learning_rate": 0.0004999832057344664,
|
|
"loss": 6.2739,
|
|
"mean_token_accuracy": 0.11782214790582657,
|
|
"num_tokens": 2688126.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"entropy": 6.110802221298218,
|
|
"epoch": 0.12308338584331023,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999828378581504,
|
|
"loss": 6.2493,
|
|
"mean_token_accuracy": 0.1267126329243183,
|
|
"num_tokens": 2697245.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"entropy": 6.332847547531128,
|
|
"epoch": 0.12350346565847511,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999824659963793,
|
|
"loss": 6.3012,
|
|
"mean_token_accuracy": 0.12391207665205002,
|
|
"num_tokens": 2705934.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"entropy": 6.2204491138458256,
|
|
"epoch": 0.12392354547364,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004999820901491598,
|
|
"loss": 6.2299,
|
|
"mean_token_accuracy": 0.12465188652276993,
|
|
"num_tokens": 2714367.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"entropy": 6.265383291244507,
|
|
"epoch": 0.12434362528880487,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999817103164983,
|
|
"loss": 6.2882,
|
|
"mean_token_accuracy": 0.12172888666391372,
|
|
"num_tokens": 2724366.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"entropy": 6.282348680496216,
|
|
"epoch": 0.12476370510396975,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999813264984017,
|
|
"loss": 6.284,
|
|
"mean_token_accuracy": 0.11969415470957756,
|
|
"num_tokens": 2733980.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"entropy": 6.340108251571655,
|
|
"epoch": 0.12518378491913462,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004999809386948767,
|
|
"loss": 6.2714,
|
|
"mean_token_accuracy": 0.12543393298983574,
|
|
"num_tokens": 2744013.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"entropy": 6.2037091732025145,
|
|
"epoch": 0.12560386473429952,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999805469059302,
|
|
"loss": 6.3445,
|
|
"mean_token_accuracy": 0.11714137867093086,
|
|
"num_tokens": 2753385.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"entropy": 6.296666240692138,
|
|
"epoch": 0.1260239445494644,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999801511315693,
|
|
"loss": 6.1931,
|
|
"mean_token_accuracy": 0.12192152738571167,
|
|
"num_tokens": 2762875.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"entropy": 6.284714651107788,
|
|
"epoch": 0.1264440243646293,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999797513718007,
|
|
"loss": 6.2602,
|
|
"mean_token_accuracy": 0.12598440870642663,
|
|
"num_tokens": 2772182.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"entropy": 6.161528825759888,
|
|
"epoch": 0.12686410417979416,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999793476266317,
|
|
"loss": 6.2127,
|
|
"mean_token_accuracy": 0.12566941007971763,
|
|
"num_tokens": 2780814.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"entropy": 6.568186330795288,
|
|
"epoch": 0.12728418399495905,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999789398960695,
|
|
"loss": 6.483,
|
|
"mean_token_accuracy": 0.11876541525125503,
|
|
"num_tokens": 2791104.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"entropy": 6.092951726913452,
|
|
"epoch": 0.12770426381012392,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999785281801212,
|
|
"loss": 6.1965,
|
|
"mean_token_accuracy": 0.12290481179952621,
|
|
"num_tokens": 2800081.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"entropy": 6.291093444824218,
|
|
"epoch": 0.1281243436252888,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499978112478794,
|
|
"loss": 6.3106,
|
|
"mean_token_accuracy": 0.12357923462986946,
|
|
"num_tokens": 2809096.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"entropy": 6.3231532096862795,
|
|
"epoch": 0.1285444234404537,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999776927920955,
|
|
"loss": 6.2832,
|
|
"mean_token_accuracy": 0.11848211586475373,
|
|
"num_tokens": 2818857.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"entropy": 6.26645565032959,
|
|
"epoch": 0.12896450325561856,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499977269120033,
|
|
"loss": 6.3691,
|
|
"mean_token_accuracy": 0.11801103353500367,
|
|
"num_tokens": 2829332.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"entropy": 6.273476028442383,
|
|
"epoch": 0.12938458307078346,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.000499976841462614,
|
|
"loss": 6.2806,
|
|
"mean_token_accuracy": 0.1197669893503189,
|
|
"num_tokens": 2839193.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"entropy": 6.308599948883057,
|
|
"epoch": 0.12980466288594833,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.000499976409819846,
|
|
"loss": 6.2735,
|
|
"mean_token_accuracy": 0.11774821653962135,
|
|
"num_tokens": 2848535.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"entropy": 6.131243658065796,
|
|
"epoch": 0.1302247427011132,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999759741917369,
|
|
"loss": 6.1661,
|
|
"mean_token_accuracy": 0.12612850666046144,
|
|
"num_tokens": 2858090.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"entropy": 6.301682853698731,
|
|
"epoch": 0.1306448225162781,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004999755345782941,
|
|
"loss": 6.3181,
|
|
"mean_token_accuracy": 0.1226440578699112,
|
|
"num_tokens": 2866984.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"entropy": 6.184937286376953,
|
|
"epoch": 0.13106490233144297,
|
|
"grad_norm": 0.94921875,
|
|
"learning_rate": 0.0004999750909795256,
|
|
"loss": 6.1325,
|
|
"mean_token_accuracy": 0.12440444529056549,
|
|
"num_tokens": 2876550.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"entropy": 6.258828830718994,
|
|
"epoch": 0.13148498214660786,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999746433954394,
|
|
"loss": 6.241,
|
|
"mean_token_accuracy": 0.12188669666647911,
|
|
"num_tokens": 2885782.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"entropy": 6.217999792098999,
|
|
"epoch": 0.13190506196177273,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499974191826043,
|
|
"loss": 6.2134,
|
|
"mean_token_accuracy": 0.1303790420293808,
|
|
"num_tokens": 2894807.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"entropy": 6.286883115768433,
|
|
"epoch": 0.1323251417769376,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999737362713448,
|
|
"loss": 6.2503,
|
|
"mean_token_accuracy": 0.12286639586091042,
|
|
"num_tokens": 2904076.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"entropy": 6.20257887840271,
|
|
"epoch": 0.1327452215921025,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004999732767313527,
|
|
"loss": 6.1442,
|
|
"mean_token_accuracy": 0.12661461755633355,
|
|
"num_tokens": 2913761.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"entropy": 6.346931409835816,
|
|
"epoch": 0.13316530140726737,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999728132060746,
|
|
"loss": 6.3898,
|
|
"mean_token_accuracy": 0.12459043636918068,
|
|
"num_tokens": 2922848.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"entropy": 6.276056003570557,
|
|
"epoch": 0.13358538122243227,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999723456955192,
|
|
"loss": 6.271,
|
|
"mean_token_accuracy": 0.1242086909711361,
|
|
"num_tokens": 2932718.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"entropy": 6.151839303970337,
|
|
"epoch": 0.13400546103759714,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999718741996945,
|
|
"loss": 6.2133,
|
|
"mean_token_accuracy": 0.12332009747624398,
|
|
"num_tokens": 2942686.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"entropy": 6.2299316883087155,
|
|
"epoch": 0.13442554085276204,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499971398718609,
|
|
"loss": 6.194,
|
|
"mean_token_accuracy": 0.12265397682785988,
|
|
"num_tokens": 2952096.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"entropy": 6.27053918838501,
|
|
"epoch": 0.1348456206679269,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999709192522708,
|
|
"loss": 6.2496,
|
|
"mean_token_accuracy": 0.12414331436157226,
|
|
"num_tokens": 2960660.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"entropy": 6.299257707595825,
|
|
"epoch": 0.13526570048309178,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999704358006887,
|
|
"loss": 6.2485,
|
|
"mean_token_accuracy": 0.1208350658416748,
|
|
"num_tokens": 2969834.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"entropy": 6.205888414382935,
|
|
"epoch": 0.13568578029825668,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999699483638712,
|
|
"loss": 6.2531,
|
|
"mean_token_accuracy": 0.12345886677503586,
|
|
"num_tokens": 2979023.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"entropy": 6.2445619106292725,
|
|
"epoch": 0.13610586011342155,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999694569418269,
|
|
"loss": 6.2532,
|
|
"mean_token_accuracy": 0.12339803278446197,
|
|
"num_tokens": 2988083.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"entropy": 6.20722599029541,
|
|
"epoch": 0.13652593992858644,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999689615345645,
|
|
"loss": 6.1689,
|
|
"mean_token_accuracy": 0.1274717427790165,
|
|
"num_tokens": 2997240.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"entropy": 6.2464118003845215,
|
|
"epoch": 0.1369460197437513,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999684621420928,
|
|
"loss": 6.2565,
|
|
"mean_token_accuracy": 0.12297938466072082,
|
|
"num_tokens": 3007077.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"entropy": 6.293700885772705,
|
|
"epoch": 0.13736609955891618,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999679587644205,
|
|
"loss": 6.2787,
|
|
"mean_token_accuracy": 0.1208807609975338,
|
|
"num_tokens": 3015821.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"entropy": 6.17975435256958,
|
|
"epoch": 0.13778617937408108,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999674514015568,
|
|
"loss": 6.2054,
|
|
"mean_token_accuracy": 0.1252801388502121,
|
|
"num_tokens": 3025858.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"entropy": 6.2544519901275635,
|
|
"epoch": 0.13820625918924595,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999669400535105,
|
|
"loss": 6.1887,
|
|
"mean_token_accuracy": 0.11709433272480965,
|
|
"num_tokens": 3035537.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"entropy": 6.045716142654419,
|
|
"epoch": 0.13862633900441085,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999664247202907,
|
|
"loss": 6.1026,
|
|
"mean_token_accuracy": 0.12171815410256386,
|
|
"num_tokens": 3044204.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"entropy": 6.264136171340942,
|
|
"epoch": 0.13904641881957572,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999659054019066,
|
|
"loss": 6.2747,
|
|
"mean_token_accuracy": 0.1242525890469551,
|
|
"num_tokens": 3053111.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"entropy": 6.191974449157715,
|
|
"epoch": 0.1394664986347406,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999653820983673,
|
|
"loss": 6.1818,
|
|
"mean_token_accuracy": 0.12419796586036683,
|
|
"num_tokens": 3062456.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"entropy": 6.188469123840332,
|
|
"epoch": 0.13988657844990549,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499964854809682,
|
|
"loss": 6.2114,
|
|
"mean_token_accuracy": 0.12520652115345002,
|
|
"num_tokens": 3071132.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"entropy": 6.217535066604614,
|
|
"epoch": 0.14030665826507036,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999643235358602,
|
|
"loss": 6.1733,
|
|
"mean_token_accuracy": 0.12733130380511284,
|
|
"num_tokens": 3080892.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"entropy": 6.150455570220947,
|
|
"epoch": 0.14072673808023525,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999637882769112,
|
|
"loss": 6.1088,
|
|
"mean_token_accuracy": 0.13008806556463243,
|
|
"num_tokens": 3089874.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"entropy": 6.232478332519531,
|
|
"epoch": 0.14114681789540012,
|
|
"grad_norm": 0.96484375,
|
|
"learning_rate": 0.0004999632490328447,
|
|
"loss": 6.2504,
|
|
"mean_token_accuracy": 0.12480302304029464,
|
|
"num_tokens": 3099535.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"entropy": 6.220491170883179,
|
|
"epoch": 0.14156689771056502,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999627058036699,
|
|
"loss": 6.1932,
|
|
"mean_token_accuracy": 0.12512605339288713,
|
|
"num_tokens": 3108772.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"entropy": 6.254866027832032,
|
|
"epoch": 0.1419869775257299,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999621585893966,
|
|
"loss": 6.2305,
|
|
"mean_token_accuracy": 0.11818314492702484,
|
|
"num_tokens": 3118333.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"entropy": 6.258799934387207,
|
|
"epoch": 0.14240705734089476,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999616073900346,
|
|
"loss": 6.2544,
|
|
"mean_token_accuracy": 0.1175099603831768,
|
|
"num_tokens": 3127356.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"entropy": 6.158872365951538,
|
|
"epoch": 0.14282713715605966,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999610522055935,
|
|
"loss": 6.2288,
|
|
"mean_token_accuracy": 0.12072905600070953,
|
|
"num_tokens": 3136859.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"entropy": 6.2665447235107425,
|
|
"epoch": 0.14324721697122453,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999604930360832,
|
|
"loss": 6.2553,
|
|
"mean_token_accuracy": 0.11907806620001793,
|
|
"num_tokens": 3146607.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"entropy": 6.134842443466186,
|
|
"epoch": 0.14366729678638943,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004999599298815136,
|
|
"loss": 6.1946,
|
|
"mean_token_accuracy": 0.12945861145853996,
|
|
"num_tokens": 3156327.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"entropy": 6.1354063034057615,
|
|
"epoch": 0.1440873766015543,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004999593627418947,
|
|
"loss": 6.1466,
|
|
"mean_token_accuracy": 0.13011169731616973,
|
|
"num_tokens": 3165559.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"entropy": 6.2760594367980955,
|
|
"epoch": 0.14450745641671917,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999587916172365,
|
|
"loss": 6.247,
|
|
"mean_token_accuracy": 0.11565925851464272,
|
|
"num_tokens": 3173850.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"entropy": 6.219358253479004,
|
|
"epoch": 0.14492753623188406,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999582165075492,
|
|
"loss": 6.1819,
|
|
"mean_token_accuracy": 0.12041235193610192,
|
|
"num_tokens": 3182838.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"entropy": 6.098177671432495,
|
|
"epoch": 0.14534761604704893,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999576374128429,
|
|
"loss": 6.1848,
|
|
"mean_token_accuracy": 0.12102061733603478,
|
|
"num_tokens": 3191692.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"entropy": 6.281035900115967,
|
|
"epoch": 0.14576769586221383,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999570543331279,
|
|
"loss": 6.2096,
|
|
"mean_token_accuracy": 0.12320142686367035,
|
|
"num_tokens": 3200069.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"entropy": 6.174470567703247,
|
|
"epoch": 0.1461877756773787,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999564672684145,
|
|
"loss": 6.2813,
|
|
"mean_token_accuracy": 0.11924844831228257,
|
|
"num_tokens": 3209653.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"entropy": 6.267424774169922,
|
|
"epoch": 0.14660785549254357,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999558762187131,
|
|
"loss": 6.1641,
|
|
"mean_token_accuracy": 0.1311913624405861,
|
|
"num_tokens": 3218313.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"entropy": 6.102459383010864,
|
|
"epoch": 0.14702793530770847,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999552811840342,
|
|
"loss": 6.0922,
|
|
"mean_token_accuracy": 0.12970734164118766,
|
|
"num_tokens": 3227525.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"entropy": 6.164451694488525,
|
|
"epoch": 0.14744801512287334,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999546821643884,
|
|
"loss": 6.2098,
|
|
"mean_token_accuracy": 0.12783958092331887,
|
|
"num_tokens": 3237022.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"entropy": 6.150622081756592,
|
|
"epoch": 0.14786809493803824,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999540791597861,
|
|
"loss": 6.1154,
|
|
"mean_token_accuracy": 0.1278826355934143,
|
|
"num_tokens": 3246605.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"entropy": 6.106969261169434,
|
|
"epoch": 0.1482881747532031,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999534721702383,
|
|
"loss": 6.065,
|
|
"mean_token_accuracy": 0.13145939782261848,
|
|
"num_tokens": 3255587.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"entropy": 6.141215419769287,
|
|
"epoch": 0.148708254568368,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999528611957553,
|
|
"loss": 6.1666,
|
|
"mean_token_accuracy": 0.1264194056391716,
|
|
"num_tokens": 3265669.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"entropy": 6.255412673950195,
|
|
"epoch": 0.14912833438353287,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999522462363485,
|
|
"loss": 6.1518,
|
|
"mean_token_accuracy": 0.13116262778639792,
|
|
"num_tokens": 3275013.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"entropy": 6.1518439769744875,
|
|
"epoch": 0.14954841419869774,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999516272920283,
|
|
"loss": 6.255,
|
|
"mean_token_accuracy": 0.1255984991788864,
|
|
"num_tokens": 3284723.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"entropy": 6.11306095123291,
|
|
"epoch": 0.14996849401386264,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499951004362806,
|
|
"loss": 6.0833,
|
|
"mean_token_accuracy": 0.12718486189842224,
|
|
"num_tokens": 3293860.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"entropy": 6.046157026290894,
|
|
"epoch": 0.1503885738290275,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999503774486924,
|
|
"loss": 6.1405,
|
|
"mean_token_accuracy": 0.1226385310292244,
|
|
"num_tokens": 3303158.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"entropy": 6.138220262527466,
|
|
"epoch": 0.1508086536441924,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999497465496987,
|
|
"loss": 6.0637,
|
|
"mean_token_accuracy": 0.12298208549618721,
|
|
"num_tokens": 3313068.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"entropy": 6.1492797374725345,
|
|
"epoch": 0.15122873345935728,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499949111665836,
|
|
"loss": 6.1591,
|
|
"mean_token_accuracy": 0.12638431563973426,
|
|
"num_tokens": 3321885.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"entropy": 6.2002543926239015,
|
|
"epoch": 0.15164881327452215,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999484727971158,
|
|
"loss": 6.1371,
|
|
"mean_token_accuracy": 0.12693015187978746,
|
|
"num_tokens": 3330924.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"entropy": 6.14166145324707,
|
|
"epoch": 0.15206889308968705,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499947829943549,
|
|
"loss": 6.176,
|
|
"mean_token_accuracy": 0.12311631590127944,
|
|
"num_tokens": 3340070.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"entropy": 6.207650995254516,
|
|
"epoch": 0.15248897290485192,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999471831051474,
|
|
"loss": 6.1689,
|
|
"mean_token_accuracy": 0.13365908414125444,
|
|
"num_tokens": 3349870.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"entropy": 6.160492658615112,
|
|
"epoch": 0.1529090527200168,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999465322819222,
|
|
"loss": 6.2169,
|
|
"mean_token_accuracy": 0.12209457084536553,
|
|
"num_tokens": 3359573.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"entropy": 6.222156381607055,
|
|
"epoch": 0.15332913253518168,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999458774738851,
|
|
"loss": 6.1684,
|
|
"mean_token_accuracy": 0.13460491448640824,
|
|
"num_tokens": 3368577.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"entropy": 6.116939735412598,
|
|
"epoch": 0.15374921235034655,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999452186810476,
|
|
"loss": 6.1162,
|
|
"mean_token_accuracy": 0.13111207485198975,
|
|
"num_tokens": 3377801.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"entropy": 6.230794095993042,
|
|
"epoch": 0.15416929216551145,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004999445559034214,
|
|
"loss": 6.1624,
|
|
"mean_token_accuracy": 0.12796897292137147,
|
|
"num_tokens": 3386666.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"entropy": 6.314156770706177,
|
|
"epoch": 0.15458937198067632,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999438891410181,
|
|
"loss": 6.3117,
|
|
"mean_token_accuracy": 0.12008170932531356,
|
|
"num_tokens": 3396086.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"entropy": 6.129251384735108,
|
|
"epoch": 0.15500945179584122,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004999432183938496,
|
|
"loss": 6.2244,
|
|
"mean_token_accuracy": 0.13115857616066934,
|
|
"num_tokens": 3404894.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"entropy": 6.144184207916259,
|
|
"epoch": 0.1554295316110061,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999425436619279,
|
|
"loss": 6.2119,
|
|
"mean_token_accuracy": 0.12016721740365029,
|
|
"num_tokens": 3414172.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"entropy": 6.27680606842041,
|
|
"epoch": 0.15584961142617096,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.000499941864945265,
|
|
"loss": 6.1711,
|
|
"mean_token_accuracy": 0.12068818733096123,
|
|
"num_tokens": 3423409.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"entropy": 6.115350151062012,
|
|
"epoch": 0.15626969124133586,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999411822438726,
|
|
"loss": 6.1448,
|
|
"mean_token_accuracy": 0.12664692029356955,
|
|
"num_tokens": 3433047.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"entropy": 6.1765364646911625,
|
|
"epoch": 0.15668977105650073,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499940495557763,
|
|
"loss": 6.1229,
|
|
"mean_token_accuracy": 0.12646402716636657,
|
|
"num_tokens": 3442490.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"entropy": 6.212595748901367,
|
|
"epoch": 0.15710985087166562,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999398048869485,
|
|
"loss": 6.1955,
|
|
"mean_token_accuracy": 0.12634197920560836,
|
|
"num_tokens": 3451804.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"entropy": 6.19734468460083,
|
|
"epoch": 0.1575299306868305,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.000499939110231441,
|
|
"loss": 6.1767,
|
|
"mean_token_accuracy": 0.1279009036719799,
|
|
"num_tokens": 3461481.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"entropy": 6.233086156845093,
|
|
"epoch": 0.1579500105019954,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004999384115912531,
|
|
"loss": 6.2344,
|
|
"mean_token_accuracy": 0.12624539732933043,
|
|
"num_tokens": 3471798.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"entropy": 6.015159845352173,
|
|
"epoch": 0.15837009031716026,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.000499937708966397,
|
|
"loss": 6.1259,
|
|
"mean_token_accuracy": 0.12552440091967582,
|
|
"num_tokens": 3481386.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"entropy": 6.193439149856568,
|
|
"epoch": 0.15879017013232513,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999370023568853,
|
|
"loss": 6.1184,
|
|
"mean_token_accuracy": 0.12572901472449302,
|
|
"num_tokens": 3489981.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"entropy": 6.112298202514649,
|
|
"epoch": 0.15921024994749003,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999362917627304,
|
|
"loss": 6.0882,
|
|
"mean_token_accuracy": 0.1290791854262352,
|
|
"num_tokens": 3498551.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"entropy": 6.154214668273926,
|
|
"epoch": 0.1596303297626549,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004999355771839448,
|
|
"loss": 6.0678,
|
|
"mean_token_accuracy": 0.13199156373739243,
|
|
"num_tokens": 3507921.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"entropy": 6.25932993888855,
|
|
"epoch": 0.1600504095778198,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004999348586205414,
|
|
"loss": 6.2391,
|
|
"mean_token_accuracy": 0.12772736102342605,
|
|
"num_tokens": 3517570.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"entropy": 6.199037790298462,
|
|
"epoch": 0.16047048939298467,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999341360725327,
|
|
"loss": 6.2333,
|
|
"mean_token_accuracy": 0.12291170060634612,
|
|
"num_tokens": 3526774.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"entropy": 6.160675954818726,
|
|
"epoch": 0.16089056920814954,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999334095399317,
|
|
"loss": 6.1636,
|
|
"mean_token_accuracy": 0.13550060987472534,
|
|
"num_tokens": 3535319.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"entropy": 6.045867156982422,
|
|
"epoch": 0.16131064902331443,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999326790227512,
|
|
"loss": 6.1319,
|
|
"mean_token_accuracy": 0.13111512288451194,
|
|
"num_tokens": 3544468.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"entropy": 6.046793222427368,
|
|
"epoch": 0.1617307288384793,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004999319445210041,
|
|
"loss": 6.0122,
|
|
"mean_token_accuracy": 0.13604443669319152,
|
|
"num_tokens": 3553529.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"entropy": 6.111843824386597,
|
|
"epoch": 0.1621508086536442,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999312060347034,
|
|
"loss": 6.0683,
|
|
"mean_token_accuracy": 0.12906411960721015,
|
|
"num_tokens": 3563053.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"entropy": 6.050889587402343,
|
|
"epoch": 0.16257088846880907,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999304635638621,
|
|
"loss": 6.0231,
|
|
"mean_token_accuracy": 0.13277052119374275,
|
|
"num_tokens": 3571877.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"entropy": 6.067962026596069,
|
|
"epoch": 0.16299096828397394,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999297171084935,
|
|
"loss": 6.0714,
|
|
"mean_token_accuracy": 0.1285587027668953,
|
|
"num_tokens": 3581496.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"entropy": 6.136378002166748,
|
|
"epoch": 0.16341104809913884,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999289666686109,
|
|
"loss": 6.0886,
|
|
"mean_token_accuracy": 0.12886847257614137,
|
|
"num_tokens": 3590752.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"entropy": 6.010327911376953,
|
|
"epoch": 0.1638311279143037,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004999282122442274,
|
|
"loss": 6.0834,
|
|
"mean_token_accuracy": 0.13124383464455605,
|
|
"num_tokens": 3599885.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"entropy": 6.264139842987061,
|
|
"epoch": 0.1642512077294686,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004999274538353564,
|
|
"loss": 6.1701,
|
|
"mean_token_accuracy": 0.12521142810583114,
|
|
"num_tokens": 3610039.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"entropy": 6.073106908798218,
|
|
"epoch": 0.16467128754463348,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999266914420114,
|
|
"loss": 6.1051,
|
|
"mean_token_accuracy": 0.12478168606758118,
|
|
"num_tokens": 3619954.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"entropy": 6.187624120712281,
|
|
"epoch": 0.16509136735979837,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499925925064206,
|
|
"loss": 6.0438,
|
|
"mean_token_accuracy": 0.13393656834959983,
|
|
"num_tokens": 3628164.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"entropy": 6.205664491653442,
|
|
"epoch": 0.16551144717496324,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004999251547019535,
|
|
"loss": 6.1949,
|
|
"mean_token_accuracy": 0.13081590309739113,
|
|
"num_tokens": 3636778.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"entropy": 6.189227771759033,
|
|
"epoch": 0.16593152699012811,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004999243803552678,
|
|
"loss": 6.1562,
|
|
"mean_token_accuracy": 0.1314916841685772,
|
|
"num_tokens": 3647046.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"entropy": 6.080912733078003,
|
|
"epoch": 0.166351606805293,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999236020241625,
|
|
"loss": 6.0778,
|
|
"mean_token_accuracy": 0.12676682993769645,
|
|
"num_tokens": 3656130.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"entropy": 6.122231149673462,
|
|
"epoch": 0.16677168662045788,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004999228197086514,
|
|
"loss": 6.1524,
|
|
"mean_token_accuracy": 0.12139208093285561,
|
|
"num_tokens": 3666145.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"entropy": 6.180514907836914,
|
|
"epoch": 0.16719176643562278,
|
|
"grad_norm": 0.95703125,
|
|
"learning_rate": 0.0004999220334087484,
|
|
"loss": 6.1973,
|
|
"mean_token_accuracy": 0.12297596782445908,
|
|
"num_tokens": 3676722.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"entropy": 6.160478258132935,
|
|
"epoch": 0.16761184625078765,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999212431244673,
|
|
"loss": 6.1951,
|
|
"mean_token_accuracy": 0.12390567734837532,
|
|
"num_tokens": 3685880.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"entropy": 6.056979942321777,
|
|
"epoch": 0.16803192606595252,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004999204488558222,
|
|
"loss": 6.0309,
|
|
"mean_token_accuracy": 0.13338653817772866,
|
|
"num_tokens": 3695167.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"entropy": 6.132491636276245,
|
|
"epoch": 0.16845200588111742,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004999196506028273,
|
|
"loss": 6.1254,
|
|
"mean_token_accuracy": 0.12816390842199327,
|
|
"num_tokens": 3703700.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"entropy": 6.12127537727356,
|
|
"epoch": 0.1688720856962823,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999188483654965,
|
|
"loss": 6.0676,
|
|
"mean_token_accuracy": 0.13127523884177209,
|
|
"num_tokens": 3712825.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"entropy": 6.0288161277771,
|
|
"epoch": 0.16929216551144718,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004999180421438442,
|
|
"loss": 6.0433,
|
|
"mean_token_accuracy": 0.13027536049485205,
|
|
"num_tokens": 3721807.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"entropy": 6.227120208740234,
|
|
"epoch": 0.16971224532661205,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004999172319378846,
|
|
"loss": 6.215,
|
|
"mean_token_accuracy": 0.12012537494301796,
|
|
"num_tokens": 3730502.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"entropy": 6.142453145980835,
|
|
"epoch": 0.17013232514177692,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999164177476319,
|
|
"loss": 6.1007,
|
|
"mean_token_accuracy": 0.13319698646664618,
|
|
"num_tokens": 3739696.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"entropy": 5.983246898651123,
|
|
"epoch": 0.17055240495694182,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999155995731009,
|
|
"loss": 6.0931,
|
|
"mean_token_accuracy": 0.12937767952680587,
|
|
"num_tokens": 3748675.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"entropy": 6.279529285430908,
|
|
"epoch": 0.1709724847721067,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999147774143057,
|
|
"loss": 6.1818,
|
|
"mean_token_accuracy": 0.12119651660323143,
|
|
"num_tokens": 3757714.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"entropy": 6.021195363998413,
|
|
"epoch": 0.1713925645872716,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499913951271261,
|
|
"loss": 5.9995,
|
|
"mean_token_accuracy": 0.13538317754864693,
|
|
"num_tokens": 3767589.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"entropy": 6.069708347320557,
|
|
"epoch": 0.17181264440243646,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004999131211439816,
|
|
"loss": 6.1103,
|
|
"mean_token_accuracy": 0.1307745970785618,
|
|
"num_tokens": 3777261.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"entropy": 6.199843311309815,
|
|
"epoch": 0.17223272421760136,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.000499912287032482,
|
|
"loss": 6.0616,
|
|
"mean_token_accuracy": 0.138626891374588,
|
|
"num_tokens": 3786658.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"entropy": 5.944658851623535,
|
|
"epoch": 0.17265280403276623,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499911448936777,
|
|
"loss": 6.0687,
|
|
"mean_token_accuracy": 0.13718581050634385,
|
|
"num_tokens": 3794977.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"entropy": 6.036355209350586,
|
|
"epoch": 0.1730728838479311,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999106068568816,
|
|
"loss": 6.1519,
|
|
"mean_token_accuracy": 0.12767373770475388,
|
|
"num_tokens": 3805138.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"entropy": 6.216224002838135,
|
|
"epoch": 0.173492963663096,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004999097607928106,
|
|
"loss": 6.0775,
|
|
"mean_token_accuracy": 0.13682154789566994,
|
|
"num_tokens": 3814444.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"entropy": 6.071376657485962,
|
|
"epoch": 0.17391304347826086,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004999089107445788,
|
|
"loss": 6.0367,
|
|
"mean_token_accuracy": 0.12884577140212058,
|
|
"num_tokens": 3822859.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"entropy": 6.008500671386718,
|
|
"epoch": 0.17433312329342576,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.0004999080567122016,
|
|
"loss": 6.0618,
|
|
"mean_token_accuracy": 0.13177087977528573,
|
|
"num_tokens": 3833159.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"entropy": 6.108531808853149,
|
|
"epoch": 0.17475320310859063,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004999071986956941,
|
|
"loss": 6.0712,
|
|
"mean_token_accuracy": 0.132802564650774,
|
|
"num_tokens": 3842136.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"entropy": 6.1049566745758055,
|
|
"epoch": 0.1751732829237555,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999063366950713,
|
|
"loss": 6.1334,
|
|
"mean_token_accuracy": 0.12718930542469026,
|
|
"num_tokens": 3851406.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"entropy": 6.059695482254028,
|
|
"epoch": 0.1755933627389204,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004999054707103486,
|
|
"loss": 6.0576,
|
|
"mean_token_accuracy": 0.13101131469011307,
|
|
"num_tokens": 3861061.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"entropy": 6.151092004776001,
|
|
"epoch": 0.17601344255408527,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004999046007415412,
|
|
"loss": 6.0408,
|
|
"mean_token_accuracy": 0.12858548611402512,
|
|
"num_tokens": 3870357.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"entropy": 6.117391204833984,
|
|
"epoch": 0.17643352236925017,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999037267886646,
|
|
"loss": 6.0479,
|
|
"mean_token_accuracy": 0.1312727876007557,
|
|
"num_tokens": 3879393.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"entropy": 6.020799350738526,
|
|
"epoch": 0.17685360218441504,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004999028488517343,
|
|
"loss": 6.0773,
|
|
"mean_token_accuracy": 0.1310334399342537,
|
|
"num_tokens": 3888030.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"entropy": 6.100592184066772,
|
|
"epoch": 0.1772736819995799,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004999019669307659,
|
|
"loss": 6.0848,
|
|
"mean_token_accuracy": 0.13095394000411034,
|
|
"num_tokens": 3897430.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"entropy": 6.128114986419678,
|
|
"epoch": 0.1776937618147448,
|
|
"grad_norm": 1.0,
|
|
"learning_rate": 0.0004999010810257749,
|
|
"loss": 6.1085,
|
|
"mean_token_accuracy": 0.1252062700688839,
|
|
"num_tokens": 3907711.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"entropy": 6.000101137161255,
|
|
"epoch": 0.17811384162990967,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004999001911367771,
|
|
"loss": 6.0329,
|
|
"mean_token_accuracy": 0.13599146604537965,
|
|
"num_tokens": 3915816.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"entropy": 6.115037250518799,
|
|
"epoch": 0.17853392144507457,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998992972637883,
|
|
"loss": 6.1562,
|
|
"mean_token_accuracy": 0.12152478694915772,
|
|
"num_tokens": 3925162.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"entropy": 6.161215543746948,
|
|
"epoch": 0.17895400126023944,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998983994068242,
|
|
"loss": 6.0325,
|
|
"mean_token_accuracy": 0.13381896317005157,
|
|
"num_tokens": 3934476.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"entropy": 6.025839996337891,
|
|
"epoch": 0.17937408107540434,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004998974975659006,
|
|
"loss": 6.0854,
|
|
"mean_token_accuracy": 0.1257789060473442,
|
|
"num_tokens": 3943501.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"entropy": 6.134726524353027,
|
|
"epoch": 0.1797941608905692,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998965917410338,
|
|
"loss": 6.0793,
|
|
"mean_token_accuracy": 0.12519558221101762,
|
|
"num_tokens": 3953663.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"entropy": 6.026215839385986,
|
|
"epoch": 0.18021424070573408,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004998956819322397,
|
|
"loss": 6.0415,
|
|
"mean_token_accuracy": 0.1311854176223278,
|
|
"num_tokens": 3962634.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"entropy": 6.095232534408569,
|
|
"epoch": 0.18063432052089898,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998947681395343,
|
|
"loss": 6.0476,
|
|
"mean_token_accuracy": 0.13477237224578859,
|
|
"num_tokens": 3972496.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"entropy": 6.216877603530884,
|
|
"epoch": 0.18105440033606385,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.000499893850362934,
|
|
"loss": 6.294,
|
|
"mean_token_accuracy": 0.12388142496347428,
|
|
"num_tokens": 3980724.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"entropy": 6.148508739471436,
|
|
"epoch": 0.18147448015122875,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998929286024548,
|
|
"loss": 6.1193,
|
|
"mean_token_accuracy": 0.12675963416695596,
|
|
"num_tokens": 3989842.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"entropy": 6.089916467666626,
|
|
"epoch": 0.18189455996639362,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004998920028581133,
|
|
"loss": 6.0396,
|
|
"mean_token_accuracy": 0.13960086852312087,
|
|
"num_tokens": 3998534.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"entropy": 6.071965646743775,
|
|
"epoch": 0.18231463978155849,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004998910731299258,
|
|
"loss": 6.066,
|
|
"mean_token_accuracy": 0.12839170619845391,
|
|
"num_tokens": 4007677.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"entropy": 6.075281476974487,
|
|
"epoch": 0.18273471959672338,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998901394179085,
|
|
"loss": 6.1252,
|
|
"mean_token_accuracy": 0.1271742030978203,
|
|
"num_tokens": 4016347.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"entropy": 6.099024391174316,
|
|
"epoch": 0.18315479941188825,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998892017220784,
|
|
"loss": 5.981,
|
|
"mean_token_accuracy": 0.13410005643963813,
|
|
"num_tokens": 4025199.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"entropy": 6.039233541488647,
|
|
"epoch": 0.18357487922705315,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004998882600424519,
|
|
"loss": 6.0482,
|
|
"mean_token_accuracy": 0.12497256994247437,
|
|
"num_tokens": 4033933.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"entropy": 6.077206420898437,
|
|
"epoch": 0.18399495904221802,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004998873143790455,
|
|
"loss": 5.9828,
|
|
"mean_token_accuracy": 0.14159614518284797,
|
|
"num_tokens": 4042891.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"entropy": 6.073574495315552,
|
|
"epoch": 0.1844150388573829,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998863647318763,
|
|
"loss": 6.0991,
|
|
"mean_token_accuracy": 0.12677306681871414,
|
|
"num_tokens": 4051123.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"entropy": 6.040490913391113,
|
|
"epoch": 0.1848351186725478,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998854111009608,
|
|
"loss": 6.0708,
|
|
"mean_token_accuracy": 0.12666793614625932,
|
|
"num_tokens": 4060025.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"entropy": 6.00008749961853,
|
|
"epoch": 0.18525519848771266,
|
|
"grad_norm": 0.98046875,
|
|
"learning_rate": 0.0004998844534863161,
|
|
"loss": 5.9771,
|
|
"mean_token_accuracy": 0.12744748294353486,
|
|
"num_tokens": 4069363.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"entropy": 6.136739826202392,
|
|
"epoch": 0.18567527830287756,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004998834918879592,
|
|
"loss": 6.1326,
|
|
"mean_token_accuracy": 0.1333842933177948,
|
|
"num_tokens": 4078855.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"entropy": 6.1249613761901855,
|
|
"epoch": 0.18609535811804243,
|
|
"grad_norm": 0.9921875,
|
|
"learning_rate": 0.000499882526305907,
|
|
"loss": 6.0922,
|
|
"mean_token_accuracy": 0.1307423233985901,
|
|
"num_tokens": 4087801.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"entropy": 6.088332986831665,
|
|
"epoch": 0.18651543793320732,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998815567401765,
|
|
"loss": 6.1042,
|
|
"mean_token_accuracy": 0.1286585159599781,
|
|
"num_tokens": 4096949.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"entropy": 6.1469251155853275,
|
|
"epoch": 0.1869355177483722,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998805831907851,
|
|
"loss": 6.0481,
|
|
"mean_token_accuracy": 0.13261425495147705,
|
|
"num_tokens": 4105399.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"entropy": 6.030412244796753,
|
|
"epoch": 0.18735559756353706,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998796056577501,
|
|
"loss": 6.0131,
|
|
"mean_token_accuracy": 0.12757696062326432,
|
|
"num_tokens": 4113873.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"entropy": 6.031377696990967,
|
|
"epoch": 0.18777567737870196,
|
|
"grad_norm": 1.015625,
|
|
"learning_rate": 0.0004998786241410886,
|
|
"loss": 6.0712,
|
|
"mean_token_accuracy": 0.13365804105997087,
|
|
"num_tokens": 4123528.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"entropy": 6.171572303771972,
|
|
"epoch": 0.18819575719386683,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.000499877638640818,
|
|
"loss": 6.0658,
|
|
"mean_token_accuracy": 0.1285923771560192,
|
|
"num_tokens": 4133370.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"entropy": 5.986340188980103,
|
|
"epoch": 0.18861583700903173,
|
|
"grad_norm": 1.0234375,
|
|
"learning_rate": 0.000499876649156956,
|
|
"loss": 5.9815,
|
|
"mean_token_accuracy": 0.13429070338606836,
|
|
"num_tokens": 4142370.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"entropy": 6.028431034088134,
|
|
"epoch": 0.1890359168241966,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998756556895196,
|
|
"loss": 6.0667,
|
|
"mean_token_accuracy": 0.13125480934977532,
|
|
"num_tokens": 4152367.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"entropy": 6.113050174713135,
|
|
"epoch": 0.18945599663936147,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499874658238527,
|
|
"loss": 6.0648,
|
|
"mean_token_accuracy": 0.1346374697983265,
|
|
"num_tokens": 4161126.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"entropy": 6.028095388412476,
|
|
"epoch": 0.18987607645452637,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998736568039957,
|
|
"loss": 5.96,
|
|
"mean_token_accuracy": 0.13207067623734475,
|
|
"num_tokens": 4169910.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"entropy": 6.0694433689117435,
|
|
"epoch": 0.19029615626969124,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998726513859432,
|
|
"loss": 6.1159,
|
|
"mean_token_accuracy": 0.12696689143776893,
|
|
"num_tokens": 4179893.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"entropy": 6.183452177047729,
|
|
"epoch": 0.19071623608485613,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998716419843875,
|
|
"loss": 6.1192,
|
|
"mean_token_accuracy": 0.1379365175962448,
|
|
"num_tokens": 4190065.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"entropy": 5.935803985595703,
|
|
"epoch": 0.191136315900021,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998706285993465,
|
|
"loss": 6.0341,
|
|
"mean_token_accuracy": 0.13346357494592667,
|
|
"num_tokens": 4198395.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"entropy": 6.13513503074646,
|
|
"epoch": 0.19155639571518587,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998696112308381,
|
|
"loss": 6.066,
|
|
"mean_token_accuracy": 0.1285228006541729,
|
|
"num_tokens": 4207555.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"entropy": 5.988098859786987,
|
|
"epoch": 0.19197647553035077,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004998685898788803,
|
|
"loss": 6.0031,
|
|
"mean_token_accuracy": 0.13245714083313942,
|
|
"num_tokens": 4216533.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"entropy": 6.156089878082275,
|
|
"epoch": 0.19239655534551564,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004998675645434914,
|
|
"loss": 6.1082,
|
|
"mean_token_accuracy": 0.13477368876338006,
|
|
"num_tokens": 4225575.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"entropy": 5.991475677490234,
|
|
"epoch": 0.19281663516068054,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998665352246891,
|
|
"loss": 5.8834,
|
|
"mean_token_accuracy": 0.1395990490913391,
|
|
"num_tokens": 4234306.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"entropy": 5.991662073135376,
|
|
"epoch": 0.1932367149758454,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998655019224921,
|
|
"loss": 6.0833,
|
|
"mean_token_accuracy": 0.13282228037714958,
|
|
"num_tokens": 4243998.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"entropy": 6.109613370895386,
|
|
"epoch": 0.19365679479101028,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998644646369185,
|
|
"loss": 5.9798,
|
|
"mean_token_accuracy": 0.1297495998442173,
|
|
"num_tokens": 4253653.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"entropy": 5.980562829971314,
|
|
"epoch": 0.19407687460617518,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998634233679865,
|
|
"loss": 6.0677,
|
|
"mean_token_accuracy": 0.12498517185449601,
|
|
"num_tokens": 4263305.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"entropy": 6.036490631103516,
|
|
"epoch": 0.19449695442134005,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.000499862378115715,
|
|
"loss": 5.942,
|
|
"mean_token_accuracy": 0.13776571899652482,
|
|
"num_tokens": 4272212.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"entropy": 6.152922439575195,
|
|
"epoch": 0.19491703423650494,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998613288801221,
|
|
"loss": 6.1425,
|
|
"mean_token_accuracy": 0.12971725761890412,
|
|
"num_tokens": 4281445.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"entropy": 6.123716592788696,
|
|
"epoch": 0.1953371140516698,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998602756612267,
|
|
"loss": 6.0573,
|
|
"mean_token_accuracy": 0.13039504289627074,
|
|
"num_tokens": 4290938.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"entropy": 6.004360866546631,
|
|
"epoch": 0.1957571938668347,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998592184590471,
|
|
"loss": 6.0764,
|
|
"mean_token_accuracy": 0.13114980682730676,
|
|
"num_tokens": 4300022.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"entropy": 5.995278835296631,
|
|
"epoch": 0.19617727368199958,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998581572736024,
|
|
"loss": 5.9693,
|
|
"mean_token_accuracy": 0.1386754259467125,
|
|
"num_tokens": 4308910.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"entropy": 5.990830326080323,
|
|
"epoch": 0.19659735349716445,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998570921049112,
|
|
"loss": 5.964,
|
|
"mean_token_accuracy": 0.13531014919281006,
|
|
"num_tokens": 4317136.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"entropy": 6.019982814788818,
|
|
"epoch": 0.19701743331232935,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998560229529924,
|
|
"loss": 6.0043,
|
|
"mean_token_accuracy": 0.13840724304318427,
|
|
"num_tokens": 4326163.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"entropy": 6.181583261489868,
|
|
"epoch": 0.19743751312749422,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998549498178649,
|
|
"loss": 6.1515,
|
|
"mean_token_accuracy": 0.13239141255617143,
|
|
"num_tokens": 4335837.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"entropy": 6.1028111457824705,
|
|
"epoch": 0.19785759294265912,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998538726995477,
|
|
"loss": 6.0502,
|
|
"mean_token_accuracy": 0.13465801179409026,
|
|
"num_tokens": 4345108.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"entropy": 6.11204948425293,
|
|
"epoch": 0.198277672757824,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.00049985279159806,
|
|
"loss": 6.0904,
|
|
"mean_token_accuracy": 0.13041200041770934,
|
|
"num_tokens": 4353761.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"entropy": 6.025763607025146,
|
|
"epoch": 0.19869775257298886,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998517065134208,
|
|
"loss": 6.0492,
|
|
"mean_token_accuracy": 0.1321948856115341,
|
|
"num_tokens": 4363244.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"entropy": 6.045213079452514,
|
|
"epoch": 0.19911783238815375,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998506174456494,
|
|
"loss": 6.0414,
|
|
"mean_token_accuracy": 0.1313652828335762,
|
|
"num_tokens": 4373034.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"entropy": 6.018647909164429,
|
|
"epoch": 0.19953791220331862,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998495243947653,
|
|
"loss": 5.9888,
|
|
"mean_token_accuracy": 0.12499892637133599,
|
|
"num_tokens": 4382554.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"entropy": 6.050378942489624,
|
|
"epoch": 0.19995799201848352,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004998484273607875,
|
|
"loss": 6.0109,
|
|
"mean_token_accuracy": 0.13378100991249084,
|
|
"num_tokens": 4391001.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"entropy": 5.911934089660645,
|
|
"epoch": 0.2003780718336484,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998473263437356,
|
|
"loss": 5.9179,
|
|
"mean_token_accuracy": 0.13600271940231323,
|
|
"num_tokens": 4400632.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"entropy": 6.02356653213501,
|
|
"epoch": 0.20079815164881326,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499846221343629,
|
|
"loss": 6.0212,
|
|
"mean_token_accuracy": 0.12660589516162873,
|
|
"num_tokens": 4409565.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"entropy": 6.01599760055542,
|
|
"epoch": 0.20121823146397816,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998451123604875,
|
|
"loss": 5.9683,
|
|
"mean_token_accuracy": 0.13648212924599648,
|
|
"num_tokens": 4418384.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"entropy": 6.052209377288818,
|
|
"epoch": 0.20163831127914303,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004998439993943306,
|
|
"loss": 6.0768,
|
|
"mean_token_accuracy": 0.13455061092972756,
|
|
"num_tokens": 4427581.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"entropy": 6.126885080337525,
|
|
"epoch": 0.20205839109430793,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998428824451779,
|
|
"loss": 6.0655,
|
|
"mean_token_accuracy": 0.12827305421233176,
|
|
"num_tokens": 4436572.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"entropy": 6.031488513946533,
|
|
"epoch": 0.2024784709094728,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004998417615130495,
|
|
"loss": 6.0686,
|
|
"mean_token_accuracy": 0.13068403899669648,
|
|
"num_tokens": 4445230.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"entropy": 6.176716995239258,
|
|
"epoch": 0.2028985507246377,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004998406365979649,
|
|
"loss": 6.1411,
|
|
"mean_token_accuracy": 0.13211808502674102,
|
|
"num_tokens": 4454251.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"entropy": 6.0446735382080075,
|
|
"epoch": 0.20331863053980256,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998395076999443,
|
|
"loss": 5.9835,
|
|
"mean_token_accuracy": 0.13375458046793937,
|
|
"num_tokens": 4463949.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"entropy": 6.100032329559326,
|
|
"epoch": 0.20373871035496743,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998383748190076,
|
|
"loss": 6.1638,
|
|
"mean_token_accuracy": 0.12677136287093163,
|
|
"num_tokens": 4473373.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"entropy": 6.1406354904174805,
|
|
"epoch": 0.20415879017013233,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004998372379551748,
|
|
"loss": 6.0192,
|
|
"mean_token_accuracy": 0.1297541990876198,
|
|
"num_tokens": 4482303.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"entropy": 5.960326719284057,
|
|
"epoch": 0.2045788699852972,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998360971084663,
|
|
"loss": 5.9691,
|
|
"mean_token_accuracy": 0.13114270120859145,
|
|
"num_tokens": 4491214.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"entropy": 5.993530750274658,
|
|
"epoch": 0.2049989498004621,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004998349522789019,
|
|
"loss": 5.8971,
|
|
"mean_token_accuracy": 0.13634659722447395,
|
|
"num_tokens": 4500099.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"entropy": 5.949355411529541,
|
|
"epoch": 0.20541902961562697,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004998338034665021,
|
|
"loss": 5.9773,
|
|
"mean_token_accuracy": 0.13802963197231294,
|
|
"num_tokens": 4509893.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"entropy": 6.010091781616211,
|
|
"epoch": 0.20583910943079184,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998326506712872,
|
|
"loss": 5.9481,
|
|
"mean_token_accuracy": 0.1345847800374031,
|
|
"num_tokens": 4518606.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"entropy": 6.054938316345215,
|
|
"epoch": 0.20625918924595674,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998314938932778,
|
|
"loss": 6.0368,
|
|
"mean_token_accuracy": 0.13336761966347693,
|
|
"num_tokens": 4528392.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"entropy": 6.06166090965271,
|
|
"epoch": 0.2066792690611216,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004998303331324943,
|
|
"loss": 6.0068,
|
|
"mean_token_accuracy": 0.13653545677661896,
|
|
"num_tokens": 4536983.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"entropy": 5.900071907043457,
|
|
"epoch": 0.2070993488762865,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004998291683889571,
|
|
"loss": 5.9105,
|
|
"mean_token_accuracy": 0.14012753888964652,
|
|
"num_tokens": 4544967.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"entropy": 6.0055500030517575,
|
|
"epoch": 0.20751942869145137,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000499827999662687,
|
|
"loss": 5.9915,
|
|
"mean_token_accuracy": 0.1313713811337948,
|
|
"num_tokens": 4554646.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"entropy": 6.090430212020874,
|
|
"epoch": 0.20793950850661624,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998268269537046,
|
|
"loss": 6.0166,
|
|
"mean_token_accuracy": 0.13576155975461007,
|
|
"num_tokens": 4564040.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"entropy": 6.014799499511719,
|
|
"epoch": 0.20835958832178114,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004998256502620308,
|
|
"loss": 6.0293,
|
|
"mean_token_accuracy": 0.13742104843258857,
|
|
"num_tokens": 4573758.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"entropy": 6.133381319046021,
|
|
"epoch": 0.208779668136946,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998244695876864,
|
|
"loss": 6.0616,
|
|
"mean_token_accuracy": 0.13331351354718207,
|
|
"num_tokens": 4582097.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"entropy": 5.951827907562256,
|
|
"epoch": 0.2091997479521109,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004998232849306921,
|
|
"loss": 6.0184,
|
|
"mean_token_accuracy": 0.13623687997460365,
|
|
"num_tokens": 4590687.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"entropy": 6.113849639892578,
|
|
"epoch": 0.20961982776727578,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004998220962910693,
|
|
"loss": 6.0063,
|
|
"mean_token_accuracy": 0.1295908585190773,
|
|
"num_tokens": 4599497.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"entropy": 6.004902267456055,
|
|
"epoch": 0.21003990758244068,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004998209036688386,
|
|
"loss": 5.9761,
|
|
"mean_token_accuracy": 0.13407944440841674,
|
|
"num_tokens": 4607958.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"entropy": 6.0813335418701175,
|
|
"epoch": 0.21045998739760555,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004998197070640216,
|
|
"loss": 6.1175,
|
|
"mean_token_accuracy": 0.1263462521135807,
|
|
"num_tokens": 4617515.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"entropy": 6.137440729141235,
|
|
"epoch": 0.21088006721277042,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004998185064766391,
|
|
"loss": 5.9907,
|
|
"mean_token_accuracy": 0.1353794366121292,
|
|
"num_tokens": 4627037.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"entropy": 5.9435793399810795,
|
|
"epoch": 0.21130014702793531,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004998173019067127,
|
|
"loss": 5.9898,
|
|
"mean_token_accuracy": 0.13374351039528848,
|
|
"num_tokens": 4637393.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"entropy": 5.990574741363526,
|
|
"epoch": 0.21172022684310018,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998160933542633,
|
|
"loss": 6.0354,
|
|
"mean_token_accuracy": 0.1225339263677597,
|
|
"num_tokens": 4646832.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"entropy": 6.078363418579102,
|
|
"epoch": 0.21214030665826508,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998148808193128,
|
|
"loss": 6.0571,
|
|
"mean_token_accuracy": 0.1361754283308983,
|
|
"num_tokens": 4655719.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"entropy": 6.027440595626831,
|
|
"epoch": 0.21256038647342995,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998136643018823,
|
|
"loss": 6.0247,
|
|
"mean_token_accuracy": 0.13285491690039636,
|
|
"num_tokens": 4665364.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"entropy": 6.0071735858917235,
|
|
"epoch": 0.21298046628859482,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004998124438019935,
|
|
"loss": 5.9795,
|
|
"mean_token_accuracy": 0.13230021819472312,
|
|
"num_tokens": 4674760.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"entropy": 5.971972846984864,
|
|
"epoch": 0.21340054610375972,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004998112193196681,
|
|
"loss": 5.9064,
|
|
"mean_token_accuracy": 0.1363543339073658,
|
|
"num_tokens": 4683900.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"entropy": 5.9554856300354,
|
|
"epoch": 0.2138206259189246,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998099908549277,
|
|
"loss": 5.9628,
|
|
"mean_token_accuracy": 0.12749610617756843,
|
|
"num_tokens": 4693915.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"entropy": 5.917505264282227,
|
|
"epoch": 0.2142407057340895,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.000499808758407794,
|
|
"loss": 5.8105,
|
|
"mean_token_accuracy": 0.14476394206285476,
|
|
"num_tokens": 4703102.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"entropy": 6.020478820800781,
|
|
"epoch": 0.21466078554925436,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004998075219782889,
|
|
"loss": 6.0403,
|
|
"mean_token_accuracy": 0.13109349682927132,
|
|
"num_tokens": 4712925.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"entropy": 6.050074434280395,
|
|
"epoch": 0.21508086536441923,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004998062815664344,
|
|
"loss": 5.9753,
|
|
"mean_token_accuracy": 0.13155975714325904,
|
|
"num_tokens": 4722641.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"entropy": 5.953602123260498,
|
|
"epoch": 0.21550094517958412,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004998050371722524,
|
|
"loss": 6.0504,
|
|
"mean_token_accuracy": 0.12833617106080056,
|
|
"num_tokens": 4732603.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"entropy": 5.983808517456055,
|
|
"epoch": 0.215921024994749,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004998037887957649,
|
|
"loss": 5.8814,
|
|
"mean_token_accuracy": 0.1358911283314228,
|
|
"num_tokens": 4742644.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"entropy": 6.108869409561157,
|
|
"epoch": 0.2163411048099139,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004998025364369939,
|
|
"loss": 6.2019,
|
|
"mean_token_accuracy": 0.1304102584719658,
|
|
"num_tokens": 4751482.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"entropy": 6.196333599090576,
|
|
"epoch": 0.21676118462507876,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004998012800959619,
|
|
"loss": 6.0606,
|
|
"mean_token_accuracy": 0.13010098412632942,
|
|
"num_tokens": 4760593.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"entropy": 6.040443658828735,
|
|
"epoch": 0.21718126444024366,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004998000197726909,
|
|
"loss": 6.0456,
|
|
"mean_token_accuracy": 0.13714693188667298,
|
|
"num_tokens": 4769294.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"entropy": 6.037139892578125,
|
|
"epoch": 0.21760134425540853,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004997987554672033,
|
|
"loss": 5.9468,
|
|
"mean_token_accuracy": 0.1388247735798359,
|
|
"num_tokens": 4779239.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"entropy": 6.028454113006592,
|
|
"epoch": 0.2180214240705734,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004997974871795215,
|
|
"loss": 6.0394,
|
|
"mean_token_accuracy": 0.1312641680240631,
|
|
"num_tokens": 4788211.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"entropy": 6.064166069030762,
|
|
"epoch": 0.2184415038857383,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.000499796214909668,
|
|
"loss": 6.0092,
|
|
"mean_token_accuracy": 0.13773878663778305,
|
|
"num_tokens": 4797921.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"entropy": 6.0709045886993405,
|
|
"epoch": 0.21886158370090317,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997949386576653,
|
|
"loss": 6.0042,
|
|
"mean_token_accuracy": 0.1314603127539158,
|
|
"num_tokens": 4807772.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"entropy": 5.99371829032898,
|
|
"epoch": 0.21928166351606806,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499793658423536,
|
|
"loss": 6.0192,
|
|
"mean_token_accuracy": 0.13455276489257811,
|
|
"num_tokens": 4817999.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"entropy": 6.059015130996704,
|
|
"epoch": 0.21970174333123293,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004997923742073028,
|
|
"loss": 5.9804,
|
|
"mean_token_accuracy": 0.14437463730573655,
|
|
"num_tokens": 4826679.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"entropy": 5.943931245803833,
|
|
"epoch": 0.2201218231463978,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997910860089884,
|
|
"loss": 5.9832,
|
|
"mean_token_accuracy": 0.13589627295732498,
|
|
"num_tokens": 4834998.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"entropy": 6.057482719421387,
|
|
"epoch": 0.2205419029615627,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997897938286156,
|
|
"loss": 5.9317,
|
|
"mean_token_accuracy": 0.13643529042601585,
|
|
"num_tokens": 4843635.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"entropy": 6.018534517288208,
|
|
"epoch": 0.22096198277672757,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004997884976662075,
|
|
"loss": 6.055,
|
|
"mean_token_accuracy": 0.1327926956117153,
|
|
"num_tokens": 4852027.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"entropy": 6.1170580863952635,
|
|
"epoch": 0.22138206259189247,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997871975217868,
|
|
"loss": 5.9753,
|
|
"mean_token_accuracy": 0.14027014896273612,
|
|
"num_tokens": 4861244.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"entropy": 5.881848526000977,
|
|
"epoch": 0.22180214240705734,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997858933953768,
|
|
"loss": 5.8911,
|
|
"mean_token_accuracy": 0.13821944668889047,
|
|
"num_tokens": 4869902.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"entropy": 5.9110313892364506,
|
|
"epoch": 0.2222222222222222,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997845852870004,
|
|
"loss": 5.8706,
|
|
"mean_token_accuracy": 0.1410742297768593,
|
|
"num_tokens": 4878502.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"entropy": 5.945323467254639,
|
|
"epoch": 0.2226423020373871,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997832731966806,
|
|
"loss": 5.9249,
|
|
"mean_token_accuracy": 0.1411375291645527,
|
|
"num_tokens": 4888348.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"entropy": 5.97375717163086,
|
|
"epoch": 0.22306238185255198,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997819571244411,
|
|
"loss": 5.9955,
|
|
"mean_token_accuracy": 0.13679953366518022,
|
|
"num_tokens": 4897302.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"entropy": 6.008918142318725,
|
|
"epoch": 0.22348246166771688,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004997806370703049,
|
|
"loss": 6.0213,
|
|
"mean_token_accuracy": 0.13776542693376542,
|
|
"num_tokens": 4907078.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"entropy": 5.947820472717285,
|
|
"epoch": 0.22390254148288175,
|
|
"grad_norm": 1.0078125,
|
|
"learning_rate": 0.0004997793130342954,
|
|
"loss": 5.8494,
|
|
"mean_token_accuracy": 0.13728704303503036,
|
|
"num_tokens": 4917489.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"entropy": 5.903385925292969,
|
|
"epoch": 0.22432262129804661,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997779850164363,
|
|
"loss": 5.9513,
|
|
"mean_token_accuracy": 0.1372543305158615,
|
|
"num_tokens": 4927073.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"entropy": 6.074141645431519,
|
|
"epoch": 0.2247427011132115,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004997766530167508,
|
|
"loss": 6.0449,
|
|
"mean_token_accuracy": 0.13193764314055442,
|
|
"num_tokens": 4935464.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"entropy": 6.150123214721679,
|
|
"epoch": 0.22516278092837638,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004997753170352627,
|
|
"loss": 6.1293,
|
|
"mean_token_accuracy": 0.13081697300076484,
|
|
"num_tokens": 4944718.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"entropy": 6.070659351348877,
|
|
"epoch": 0.22558286074354128,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004997739770719955,
|
|
"loss": 6.0077,
|
|
"mean_token_accuracy": 0.13340400233864785,
|
|
"num_tokens": 4954223.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"entropy": 5.978194713592529,
|
|
"epoch": 0.22600294055870615,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.000499772633126973,
|
|
"loss": 6.0415,
|
|
"mean_token_accuracy": 0.12924405336380004,
|
|
"num_tokens": 4963371.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"entropy": 5.975288677215576,
|
|
"epoch": 0.22642302037387105,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997712852002192,
|
|
"loss": 5.9086,
|
|
"mean_token_accuracy": 0.1422348402440548,
|
|
"num_tokens": 4972973.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"entropy": 6.000183725357056,
|
|
"epoch": 0.22684310018903592,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004997699332917578,
|
|
"loss": 6.141,
|
|
"mean_token_accuracy": 0.12485837489366532,
|
|
"num_tokens": 4982808.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"entropy": 6.124313545227051,
|
|
"epoch": 0.2272631800042008,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004997685774016127,
|
|
"loss": 6.0087,
|
|
"mean_token_accuracy": 0.13304658830165864,
|
|
"num_tokens": 4992427.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"entropy": 6.108034229278564,
|
|
"epoch": 0.22768325981936569,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499767217529808,
|
|
"loss": 6.1924,
|
|
"mean_token_accuracy": 0.12314376682043075,
|
|
"num_tokens": 5003562.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"entropy": 6.024952030181884,
|
|
"epoch": 0.22810333963453056,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997658536763678,
|
|
"loss": 5.8848,
|
|
"mean_token_accuracy": 0.13965026810765266,
|
|
"num_tokens": 5013429.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"entropy": 6.031959342956543,
|
|
"epoch": 0.22852341944969545,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004997644858413163,
|
|
"loss": 6.0173,
|
|
"mean_token_accuracy": 0.1369493454694748,
|
|
"num_tokens": 5022045.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"entropy": 5.9200338363647464,
|
|
"epoch": 0.22894349926486032,
|
|
"grad_norm": 1.0390625,
|
|
"learning_rate": 0.0004997631140246775,
|
|
"loss": 5.8591,
|
|
"mean_token_accuracy": 0.14441144168376924,
|
|
"num_tokens": 5032260.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"entropy": 5.958108282089233,
|
|
"epoch": 0.2293635790800252,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499761738226476,
|
|
"loss": 5.9041,
|
|
"mean_token_accuracy": 0.1375686287879944,
|
|
"num_tokens": 5041688.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"entropy": 5.965337133407592,
|
|
"epoch": 0.2297836588951901,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.000499760358446736,
|
|
"loss": 6.0072,
|
|
"mean_token_accuracy": 0.13241190686821938,
|
|
"num_tokens": 5051005.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"entropy": 6.077162218093872,
|
|
"epoch": 0.23020373871035496,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499758974685482,
|
|
"loss": 5.9379,
|
|
"mean_token_accuracy": 0.13544429317116738,
|
|
"num_tokens": 5060084.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"entropy": 5.981188869476318,
|
|
"epoch": 0.23062381852551986,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997575869427385,
|
|
"loss": 5.953,
|
|
"mean_token_accuracy": 0.13910676240921022,
|
|
"num_tokens": 5069081.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"entropy": 5.978755378723145,
|
|
"epoch": 0.23104389834068473,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.00049975619521853,
|
|
"loss": 5.9429,
|
|
"mean_token_accuracy": 0.13454415425658225,
|
|
"num_tokens": 5078597.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"entropy": 5.911752319335937,
|
|
"epoch": 0.2314639781558496,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997547995128814,
|
|
"loss": 5.9829,
|
|
"mean_token_accuracy": 0.13646793067455293,
|
|
"num_tokens": 5087607.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"entropy": 6.035622882843017,
|
|
"epoch": 0.2318840579710145,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004997533998258171,
|
|
"loss": 5.9832,
|
|
"mean_token_accuracy": 0.13701630160212516,
|
|
"num_tokens": 5097412.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"entropy": 6.060332536697388,
|
|
"epoch": 0.23230413778617937,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997519961573622,
|
|
"loss": 6.0518,
|
|
"mean_token_accuracy": 0.1287323147058487,
|
|
"num_tokens": 5105817.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"entropy": 6.132694864273072,
|
|
"epoch": 0.23272421760134426,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004997505885075414,
|
|
"loss": 6.0843,
|
|
"mean_token_accuracy": 0.13087237104773522,
|
|
"num_tokens": 5114958.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"entropy": 6.002990245819092,
|
|
"epoch": 0.23314429741650913,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997491768763795,
|
|
"loss": 6.0022,
|
|
"mean_token_accuracy": 0.13458002656698226,
|
|
"num_tokens": 5123728.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"entropy": 6.000336790084839,
|
|
"epoch": 0.23356437723167403,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004997477612639018,
|
|
"loss": 6.0532,
|
|
"mean_token_accuracy": 0.12724062129855157,
|
|
"num_tokens": 5134099.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"entropy": 6.150645542144775,
|
|
"epoch": 0.2339844570468389,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004997463416701332,
|
|
"loss": 6.0567,
|
|
"mean_token_accuracy": 0.12823428884148597,
|
|
"num_tokens": 5142934.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"entropy": 5.973050594329834,
|
|
"epoch": 0.23440453686200377,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997449180950989,
|
|
"loss": 5.9005,
|
|
"mean_token_accuracy": 0.15188876092433928,
|
|
"num_tokens": 5151835.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"entropy": 5.945583391189575,
|
|
"epoch": 0.23482461667716867,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004997434905388241,
|
|
"loss": 5.9533,
|
|
"mean_token_accuracy": 0.14123927503824235,
|
|
"num_tokens": 5161136.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"entropy": 5.966424179077149,
|
|
"epoch": 0.23524469649233354,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499742059001334,
|
|
"loss": 5.887,
|
|
"mean_token_accuracy": 0.141261176019907,
|
|
"num_tokens": 5170741.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"entropy": 5.931414556503296,
|
|
"epoch": 0.23566477630749844,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004997406234826541,
|
|
"loss": 5.9226,
|
|
"mean_token_accuracy": 0.14311096221208572,
|
|
"num_tokens": 5180549.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"entropy": 5.932320833206177,
|
|
"epoch": 0.2360848561226633,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004997391839828098,
|
|
"loss": 5.8876,
|
|
"mean_token_accuracy": 0.14145613387227057,
|
|
"num_tokens": 5189486.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"entropy": 5.9496715545654295,
|
|
"epoch": 0.23650493593782818,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997377405018266,
|
|
"loss": 5.9643,
|
|
"mean_token_accuracy": 0.1311965249478817,
|
|
"num_tokens": 5198525.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"entropy": 6.014245939254761,
|
|
"epoch": 0.23692501575299307,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.00049973629303973,
|
|
"loss": 6.0158,
|
|
"mean_token_accuracy": 0.13539923653006553,
|
|
"num_tokens": 5207124.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"entropy": 5.944891500473022,
|
|
"epoch": 0.23734509556815794,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004997348415965457,
|
|
"loss": 5.85,
|
|
"mean_token_accuracy": 0.1407323271036148,
|
|
"num_tokens": 5216529.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"entropy": 5.993428897857666,
|
|
"epoch": 0.23776517538332284,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997333861722995,
|
|
"loss": 5.9831,
|
|
"mean_token_accuracy": 0.138421493768692,
|
|
"num_tokens": 5225796.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"entropy": 6.012037515640259,
|
|
"epoch": 0.2381852551984877,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000499731926767017,
|
|
"loss": 6.0269,
|
|
"mean_token_accuracy": 0.13556732088327408,
|
|
"num_tokens": 5233876.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"entropy": 5.9344642639160154,
|
|
"epoch": 0.23860533501365258,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997304633807242,
|
|
"loss": 6.0019,
|
|
"mean_token_accuracy": 0.12836523801088334,
|
|
"num_tokens": 5244782.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"entropy": 5.987623119354248,
|
|
"epoch": 0.23902541482881748,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997289960134468,
|
|
"loss": 5.9579,
|
|
"mean_token_accuracy": 0.1335374064743519,
|
|
"num_tokens": 5253453.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"entropy": 5.978561115264893,
|
|
"epoch": 0.23944549464398235,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004997275246652111,
|
|
"loss": 5.9635,
|
|
"mean_token_accuracy": 0.14111838340759278,
|
|
"num_tokens": 5262355.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"entropy": 5.952275371551513,
|
|
"epoch": 0.23986557445914725,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499726049336043,
|
|
"loss": 5.8897,
|
|
"mean_token_accuracy": 0.1419723652303219,
|
|
"num_tokens": 5271959.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"entropy": 6.006475210189819,
|
|
"epoch": 0.24028565427431212,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997245700259686,
|
|
"loss": 5.9216,
|
|
"mean_token_accuracy": 0.1432231843471527,
|
|
"num_tokens": 5281393.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"entropy": 5.981584358215332,
|
|
"epoch": 0.240705734089477,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997230867350141,
|
|
"loss": 6.0599,
|
|
"mean_token_accuracy": 0.13176842033863068,
|
|
"num_tokens": 5290979.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"entropy": 6.072908639907837,
|
|
"epoch": 0.24112581390464188,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004997215994632059,
|
|
"loss": 5.9983,
|
|
"mean_token_accuracy": 0.13976021558046342,
|
|
"num_tokens": 5300263.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"entropy": 5.99395980834961,
|
|
"epoch": 0.24154589371980675,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004997201082105704,
|
|
"loss": 6.0192,
|
|
"mean_token_accuracy": 0.13117292299866676,
|
|
"num_tokens": 5309522.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"entropy": 6.010568284988404,
|
|
"epoch": 0.24196597353497165,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997186129771338,
|
|
"loss": 6.0248,
|
|
"mean_token_accuracy": 0.13696857616305352,
|
|
"num_tokens": 5319770.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"entropy": 6.136290264129639,
|
|
"epoch": 0.24238605335013652,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004997171137629226,
|
|
"loss": 6.0295,
|
|
"mean_token_accuracy": 0.1379177153110504,
|
|
"num_tokens": 5328400.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"entropy": 5.855829429626465,
|
|
"epoch": 0.24280613316530142,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004997156105679636,
|
|
"loss": 5.8334,
|
|
"mean_token_accuracy": 0.14593008533120155,
|
|
"num_tokens": 5336338.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"entropy": 5.898982286453247,
|
|
"epoch": 0.2432262129804663,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997141033922832,
|
|
"loss": 5.9375,
|
|
"mean_token_accuracy": 0.13418934270739555,
|
|
"num_tokens": 5345391.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"entropy": 6.035576057434082,
|
|
"epoch": 0.24364629279563116,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004997125922359081,
|
|
"loss": 5.9508,
|
|
"mean_token_accuracy": 0.13234915360808372,
|
|
"num_tokens": 5354709.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"entropy": 6.013994407653809,
|
|
"epoch": 0.24406637261079606,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997110770988652,
|
|
"loss": 5.8796,
|
|
"mean_token_accuracy": 0.1399741917848587,
|
|
"num_tokens": 5363738.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"entropy": 5.9528099536895756,
|
|
"epoch": 0.24448645242596093,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004997095579811813,
|
|
"loss": 6.0023,
|
|
"mean_token_accuracy": 0.13593828454613685,
|
|
"num_tokens": 5373583.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"entropy": 6.03057632446289,
|
|
"epoch": 0.24490653224112582,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004997080348828833,
|
|
"loss": 6.0477,
|
|
"mean_token_accuracy": 0.1340787522494793,
|
|
"num_tokens": 5383486.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"entropy": 5.969451522827148,
|
|
"epoch": 0.2453266120562907,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004997065078039981,
|
|
"loss": 5.9591,
|
|
"mean_token_accuracy": 0.1328844092786312,
|
|
"num_tokens": 5391974.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"entropy": 6.031870555877686,
|
|
"epoch": 0.24574669187145556,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004997049767445529,
|
|
"loss": 5.9995,
|
|
"mean_token_accuracy": 0.13087670058012008,
|
|
"num_tokens": 5400882.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"entropy": 6.1388874530792235,
|
|
"epoch": 0.24616677168662046,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004997034417045746,
|
|
"loss": 5.958,
|
|
"mean_token_accuracy": 0.13255189657211303,
|
|
"num_tokens": 5410538.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"entropy": 5.891916513442993,
|
|
"epoch": 0.24658685150178533,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004997019026840907,
|
|
"loss": 5.8523,
|
|
"mean_token_accuracy": 0.1406748116016388,
|
|
"num_tokens": 5419406.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"entropy": 5.81290192604065,
|
|
"epoch": 0.24700693131695023,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004997003596831282,
|
|
"loss": 5.9661,
|
|
"mean_token_accuracy": 0.13368260413408278,
|
|
"num_tokens": 5428817.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"entropy": 6.030734586715698,
|
|
"epoch": 0.2474270111321151,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996988127017145,
|
|
"loss": 5.9967,
|
|
"mean_token_accuracy": 0.1356920287013054,
|
|
"num_tokens": 5438277.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"entropy": 5.991678762435913,
|
|
"epoch": 0.24784709094728,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004996972617398772,
|
|
"loss": 6.0095,
|
|
"mean_token_accuracy": 0.13253712952136992,
|
|
"num_tokens": 5447440.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"entropy": 5.990732574462891,
|
|
"epoch": 0.24826717076244487,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004996957067976435,
|
|
"loss": 5.9368,
|
|
"mean_token_accuracy": 0.13873122334480287,
|
|
"num_tokens": 5455988.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"entropy": 6.013759565353394,
|
|
"epoch": 0.24868725057760974,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996941478750411,
|
|
"loss": 5.9498,
|
|
"mean_token_accuracy": 0.13479771465063095,
|
|
"num_tokens": 5464996.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"entropy": 6.04653902053833,
|
|
"epoch": 0.24910733039277463,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996925849720975,
|
|
"loss": 6.0789,
|
|
"mean_token_accuracy": 0.12909941822290422,
|
|
"num_tokens": 5474174.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"entropy": 6.1094592094421385,
|
|
"epoch": 0.2495274102079395,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004996910180888405,
|
|
"loss": 5.9515,
|
|
"mean_token_accuracy": 0.14010420814156532,
|
|
"num_tokens": 5482838.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"entropy": 5.933987855911255,
|
|
"epoch": 0.2499474900231044,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996894472252977,
|
|
"loss": 5.9796,
|
|
"mean_token_accuracy": 0.13611237108707427,
|
|
"num_tokens": 5491616.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"entropy": 5.946248006820679,
|
|
"epoch": 0.25036756983826924,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996878723814973,
|
|
"loss": 5.9758,
|
|
"mean_token_accuracy": 0.13201134279370308,
|
|
"num_tokens": 5500942.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"entropy": 6.011964797973633,
|
|
"epoch": 0.25078764965343414,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996862935574667,
|
|
"loss": 5.9171,
|
|
"mean_token_accuracy": 0.13369757011532785,
|
|
"num_tokens": 5510078.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"entropy": 5.91240873336792,
|
|
"epoch": 0.25120772946859904,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996847107532342,
|
|
"loss": 5.9402,
|
|
"mean_token_accuracy": 0.13632848113775253,
|
|
"num_tokens": 5518924.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"entropy": 6.007929849624634,
|
|
"epoch": 0.25162780928376394,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004996831239688277,
|
|
"loss": 5.9543,
|
|
"mean_token_accuracy": 0.13016238808631897,
|
|
"num_tokens": 5527385.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"entropy": 5.906063604354858,
|
|
"epoch": 0.2520478890989288,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996815332042754,
|
|
"loss": 5.8144,
|
|
"mean_token_accuracy": 0.14219059348106383,
|
|
"num_tokens": 5536781.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 0.2520478890989288,
|
|
"eval_entropy": 5.731865753634434,
|
|
"eval_loss": 5.98077917098999,
|
|
"eval_mean_token_accuracy": 0.14166069063261308,
|
|
"eval_num_tokens": 5536781.0,
|
|
"eval_runtime": 27.289,
|
|
"eval_samples_per_second": 1369.272,
|
|
"eval_steps_per_second": 171.168,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"entropy": 5.928550434112549,
|
|
"epoch": 0.2524679689140937,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004996799384596054,
|
|
"loss": 6.0018,
|
|
"mean_token_accuracy": 0.13845922499895097,
|
|
"num_tokens": 5545893.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"entropy": 6.047553873062133,
|
|
"epoch": 0.2528880487292586,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996783397348461,
|
|
"loss": 5.9557,
|
|
"mean_token_accuracy": 0.13133809193968773,
|
|
"num_tokens": 5555818.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"entropy": 5.98364634513855,
|
|
"epoch": 0.2533081285444234,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004996767370300256,
|
|
"loss": 5.9338,
|
|
"mean_token_accuracy": 0.13684593588113786,
|
|
"num_tokens": 5565331.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"entropy": 6.016663599014282,
|
|
"epoch": 0.2537282083595883,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004996751303451724,
|
|
"loss": 5.9132,
|
|
"mean_token_accuracy": 0.1414400041103363,
|
|
"num_tokens": 5574003.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"entropy": 5.934581279754639,
|
|
"epoch": 0.2541482881747532,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996735196803149,
|
|
"loss": 5.8367,
|
|
"mean_token_accuracy": 0.14427052065730095,
|
|
"num_tokens": 5582517.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"entropy": 5.937156009674072,
|
|
"epoch": 0.2545683679899181,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996719050354818,
|
|
"loss": 6.0272,
|
|
"mean_token_accuracy": 0.1350693427026272,
|
|
"num_tokens": 5591952.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"entropy": 5.999459314346313,
|
|
"epoch": 0.25498844780508295,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996702864107015,
|
|
"loss": 5.9271,
|
|
"mean_token_accuracy": 0.1392418310046196,
|
|
"num_tokens": 5601460.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"entropy": 6.104486131668091,
|
|
"epoch": 0.25540852762024785,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004996686638060028,
|
|
"loss": 6.05,
|
|
"mean_token_accuracy": 0.13465244546532631,
|
|
"num_tokens": 5610776.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"entropy": 5.98651351928711,
|
|
"epoch": 0.25582860743541275,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004996670372214144,
|
|
"loss": 5.9593,
|
|
"mean_token_accuracy": 0.1381534829735756,
|
|
"num_tokens": 5619627.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"entropy": 5.846577882766724,
|
|
"epoch": 0.2562486872505776,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004996654066569651,
|
|
"loss": 5.8254,
|
|
"mean_token_accuracy": 0.1441572315990925,
|
|
"num_tokens": 5628969.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"entropy": 5.9285993576049805,
|
|
"epoch": 0.2566687670657425,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004996637721126839,
|
|
"loss": 5.8895,
|
|
"mean_token_accuracy": 0.13785991445183754,
|
|
"num_tokens": 5638629.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"entropy": 6.003174924850464,
|
|
"epoch": 0.2570888468809074,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004996621335885996,
|
|
"loss": 5.9755,
|
|
"mean_token_accuracy": 0.13731449097394943,
|
|
"num_tokens": 5647571.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"entropy": 5.987164306640625,
|
|
"epoch": 0.2575089266960722,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004996604910847413,
|
|
"loss": 5.8754,
|
|
"mean_token_accuracy": 0.14926859214901925,
|
|
"num_tokens": 5656709.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"entropy": 5.953637361526489,
|
|
"epoch": 0.2579290065112371,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499658844601138,
|
|
"loss": 6.0688,
|
|
"mean_token_accuracy": 0.1354634039103985,
|
|
"num_tokens": 5665714.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"entropy": 6.097519016265869,
|
|
"epoch": 0.258349086326402,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.000499657194137819,
|
|
"loss": 6.0234,
|
|
"mean_token_accuracy": 0.13649424612522126,
|
|
"num_tokens": 5675854.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"entropy": 5.9704999923706055,
|
|
"epoch": 0.2587691661415669,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996555396948136,
|
|
"loss": 5.8448,
|
|
"mean_token_accuracy": 0.13540665656328202,
|
|
"num_tokens": 5685690.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"entropy": 5.92389030456543,
|
|
"epoch": 0.25918924595673176,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004996538812721509,
|
|
"loss": 5.8958,
|
|
"mean_token_accuracy": 0.1428774431347847,
|
|
"num_tokens": 5695766.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"entropy": 5.918001127243042,
|
|
"epoch": 0.25960932577189666,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004996522188698603,
|
|
"loss": 5.9586,
|
|
"mean_token_accuracy": 0.13920465260744094,
|
|
"num_tokens": 5704365.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"entropy": 6.103138256072998,
|
|
"epoch": 0.26002940558706156,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004996505524879714,
|
|
"loss": 6.0694,
|
|
"mean_token_accuracy": 0.13200636729598045,
|
|
"num_tokens": 5713345.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"entropy": 6.0067219734191895,
|
|
"epoch": 0.2604494854022264,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004996488821265137,
|
|
"loss": 5.8544,
|
|
"mean_token_accuracy": 0.1430236168205738,
|
|
"num_tokens": 5722907.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"entropy": 5.831529951095581,
|
|
"epoch": 0.2608695652173913,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004996472077855166,
|
|
"loss": 5.898,
|
|
"mean_token_accuracy": 0.14116744548082352,
|
|
"num_tokens": 5731589.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"entropy": 5.962000036239624,
|
|
"epoch": 0.2612896450325562,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.00049964552946501,
|
|
"loss": 5.9005,
|
|
"mean_token_accuracy": 0.13415754735469818,
|
|
"num_tokens": 5739922.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"entropy": 5.892115211486816,
|
|
"epoch": 0.2617097248477211,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004996438471650235,
|
|
"loss": 5.8122,
|
|
"mean_token_accuracy": 0.14354836270213128,
|
|
"num_tokens": 5749206.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"entropy": 5.935215997695923,
|
|
"epoch": 0.26212980466288593,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004996421608855869,
|
|
"loss": 5.8703,
|
|
"mean_token_accuracy": 0.1430413119494915,
|
|
"num_tokens": 5758803.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"entropy": 5.921274280548095,
|
|
"epoch": 0.26254988447805083,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996404706267301,
|
|
"loss": 5.9525,
|
|
"mean_token_accuracy": 0.1340932957828045,
|
|
"num_tokens": 5768368.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"entropy": 5.895799207687378,
|
|
"epoch": 0.26296996429321573,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000499638776388483,
|
|
"loss": 5.8028,
|
|
"mean_token_accuracy": 0.14530150592327118,
|
|
"num_tokens": 5776707.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"entropy": 5.956259107589721,
|
|
"epoch": 0.26339004410838057,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004996370781708757,
|
|
"loss": 5.9872,
|
|
"mean_token_accuracy": 0.13445577397942543,
|
|
"num_tokens": 5787037.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"entropy": 5.948802375793457,
|
|
"epoch": 0.26381012392354547,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004996353759739382,
|
|
"loss": 5.9353,
|
|
"mean_token_accuracy": 0.14012779742479325,
|
|
"num_tokens": 5796630.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"entropy": 5.918614721298217,
|
|
"epoch": 0.26423020373871037,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004996336697977007,
|
|
"loss": 5.9569,
|
|
"mean_token_accuracy": 0.13389407992362976,
|
|
"num_tokens": 5806402.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"entropy": 5.899946784973144,
|
|
"epoch": 0.2646502835538752,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004996319596421933,
|
|
"loss": 5.8948,
|
|
"mean_token_accuracy": 0.14252272099256516,
|
|
"num_tokens": 5815742.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"entropy": 5.881059455871582,
|
|
"epoch": 0.2650703633690401,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004996302455074466,
|
|
"loss": 5.9116,
|
|
"mean_token_accuracy": 0.13981909155845643,
|
|
"num_tokens": 5824915.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"entropy": 6.034554386138916,
|
|
"epoch": 0.265490443184205,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996285273934906,
|
|
"loss": 5.9346,
|
|
"mean_token_accuracy": 0.13784030005335807,
|
|
"num_tokens": 5834978.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"entropy": 6.000072288513183,
|
|
"epoch": 0.2659105229993699,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.000499626805300356,
|
|
"loss": 6.0896,
|
|
"mean_token_accuracy": 0.13608634248375892,
|
|
"num_tokens": 5845684.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"entropy": 6.076979541778565,
|
|
"epoch": 0.26633060281453474,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004996250792280732,
|
|
"loss": 5.9723,
|
|
"mean_token_accuracy": 0.131855096668005,
|
|
"num_tokens": 5854905.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"entropy": 6.017704057693481,
|
|
"epoch": 0.26675068262969964,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004996233491766727,
|
|
"loss": 5.995,
|
|
"mean_token_accuracy": 0.1348758891224861,
|
|
"num_tokens": 5863654.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"entropy": 5.958924627304077,
|
|
"epoch": 0.26717076244486454,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996216151461854,
|
|
"loss": 5.9782,
|
|
"mean_token_accuracy": 0.14263538494706154,
|
|
"num_tokens": 5872442.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"entropy": 6.007285785675049,
|
|
"epoch": 0.2675908422600294,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004996198771366417,
|
|
"loss": 5.9003,
|
|
"mean_token_accuracy": 0.14180114939808847,
|
|
"num_tokens": 5882372.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"entropy": 5.762058162689209,
|
|
"epoch": 0.2680109220751943,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996181351480726,
|
|
"loss": 5.7096,
|
|
"mean_token_accuracy": 0.1481250509619713,
|
|
"num_tokens": 5891113.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"entropy": 5.856069707870484,
|
|
"epoch": 0.2684310018903592,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004996163891805089,
|
|
"loss": 5.9546,
|
|
"mean_token_accuracy": 0.14241180717945098,
|
|
"num_tokens": 5899582.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"entropy": 6.006877613067627,
|
|
"epoch": 0.2688510817055241,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004996146392339815,
|
|
"loss": 5.903,
|
|
"mean_token_accuracy": 0.13792204037308692,
|
|
"num_tokens": 5908938.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"entropy": 5.92903504371643,
|
|
"epoch": 0.2692711615206889,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004996128853085215,
|
|
"loss": 5.873,
|
|
"mean_token_accuracy": 0.14175159782171248,
|
|
"num_tokens": 5918055.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"entropy": 5.962112808227539,
|
|
"epoch": 0.2696912413358538,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004996111274041598,
|
|
"loss": 5.8745,
|
|
"mean_token_accuracy": 0.1343413420021534,
|
|
"num_tokens": 5926744.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"entropy": 5.949939441680908,
|
|
"epoch": 0.2701113211510187,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004996093655209277,
|
|
"loss": 5.9643,
|
|
"mean_token_accuracy": 0.13674649894237517,
|
|
"num_tokens": 5936521.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"entropy": 6.075492525100708,
|
|
"epoch": 0.27053140096618356,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004996075996588563,
|
|
"loss": 6.0296,
|
|
"mean_token_accuracy": 0.1298151694238186,
|
|
"num_tokens": 5945010.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"entropy": 5.941525030136108,
|
|
"epoch": 0.27095148078134845,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499605829817977,
|
|
"loss": 5.9212,
|
|
"mean_token_accuracy": 0.13907130137085916,
|
|
"num_tokens": 5953766.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"entropy": 5.937248849868775,
|
|
"epoch": 0.27137156059651335,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499604055998321,
|
|
"loss": 5.8382,
|
|
"mean_token_accuracy": 0.14228671863675119,
|
|
"num_tokens": 5962168.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"entropy": 5.887500286102295,
|
|
"epoch": 0.2717916404116782,
|
|
"grad_norm": 1.03125,
|
|
"learning_rate": 0.0004996022781999198,
|
|
"loss": 5.8689,
|
|
"mean_token_accuracy": 0.14494396820664407,
|
|
"num_tokens": 5971627.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"entropy": 5.927862691879272,
|
|
"epoch": 0.2722117202268431,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499600496422805,
|
|
"loss": 5.949,
|
|
"mean_token_accuracy": 0.13576936945319176,
|
|
"num_tokens": 5981775.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"entropy": 5.942725610733032,
|
|
"epoch": 0.272631800042008,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.000499598710667008,
|
|
"loss": 5.8857,
|
|
"mean_token_accuracy": 0.1394686594605446,
|
|
"num_tokens": 5991097.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"entropy": 5.9221861362457275,
|
|
"epoch": 0.2730518798571729,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004995969209325604,
|
|
"loss": 5.9369,
|
|
"mean_token_accuracy": 0.1354317285120487,
|
|
"num_tokens": 5999517.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"entropy": 5.942779064178467,
|
|
"epoch": 0.2734719596723377,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995951272194941,
|
|
"loss": 5.9079,
|
|
"mean_token_accuracy": 0.13176682814955712,
|
|
"num_tokens": 6008545.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"entropy": 6.00344705581665,
|
|
"epoch": 0.2738920394875026,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004995933295278407,
|
|
"loss": 5.8989,
|
|
"mean_token_accuracy": 0.13847036063671112,
|
|
"num_tokens": 6017366.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"entropy": 5.900600910186768,
|
|
"epoch": 0.2743121193026675,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004995915278576321,
|
|
"loss": 5.843,
|
|
"mean_token_accuracy": 0.14386921525001525,
|
|
"num_tokens": 6025597.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"entropy": 6.0085962295532225,
|
|
"epoch": 0.27473219911783237,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004995897222089004,
|
|
"loss": 5.9437,
|
|
"mean_token_accuracy": 0.1424303874373436,
|
|
"num_tokens": 6034239.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"entropy": 6.0732769012451175,
|
|
"epoch": 0.27515227893299726,
|
|
"grad_norm": 1.109375,
|
|
"learning_rate": 0.0004995879125816772,
|
|
"loss": 5.9769,
|
|
"mean_token_accuracy": 0.13496886044740677,
|
|
"num_tokens": 6043837.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"entropy": 5.846703004837036,
|
|
"epoch": 0.27557235874816216,
|
|
"grad_norm": 1.0703125,
|
|
"learning_rate": 0.0004995860989759949,
|
|
"loss": 5.9195,
|
|
"mean_token_accuracy": 0.14464289993047713,
|
|
"num_tokens": 6053217.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"entropy": 6.013448190689087,
|
|
"epoch": 0.27599243856332706,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995842813918855,
|
|
"loss": 5.9292,
|
|
"mean_token_accuracy": 0.141995108127594,
|
|
"num_tokens": 6061553.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"entropy": 5.900888109207154,
|
|
"epoch": 0.2764125183784919,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004995824598293812,
|
|
"loss": 5.8195,
|
|
"mean_token_accuracy": 0.14140584841370582,
|
|
"num_tokens": 6070080.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"entropy": 5.960742044448852,
|
|
"epoch": 0.2768325981936568,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004995806342885142,
|
|
"loss": 5.9653,
|
|
"mean_token_accuracy": 0.14112535640597343,
|
|
"num_tokens": 6078438.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"entropy": 5.993828868865966,
|
|
"epoch": 0.2772526780088217,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.000499578804769317,
|
|
"loss": 5.9591,
|
|
"mean_token_accuracy": 0.13578373640775682,
|
|
"num_tokens": 6087794.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"entropy": 6.002421045303345,
|
|
"epoch": 0.27767275782398654,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004995769712718218,
|
|
"loss": 5.9684,
|
|
"mean_token_accuracy": 0.13969296216964722,
|
|
"num_tokens": 6096709.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"entropy": 5.897220087051392,
|
|
"epoch": 0.27809283763915144,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004995751337960613,
|
|
"loss": 5.9029,
|
|
"mean_token_accuracy": 0.14022547677159308,
|
|
"num_tokens": 6105866.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"entropy": 6.0129883766174315,
|
|
"epoch": 0.27851291745431633,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004995732923420679,
|
|
"loss": 5.8481,
|
|
"mean_token_accuracy": 0.1446751207113266,
|
|
"num_tokens": 6114882.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"entropy": 5.85595121383667,
|
|
"epoch": 0.2789329972694812,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004995714469098743,
|
|
"loss": 5.8116,
|
|
"mean_token_accuracy": 0.1394299313426018,
|
|
"num_tokens": 6123978.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"entropy": 5.841559362411499,
|
|
"epoch": 0.2793530770846461,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000499569597499513,
|
|
"loss": 5.9636,
|
|
"mean_token_accuracy": 0.1420348674058914,
|
|
"num_tokens": 6133246.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"entropy": 5.949489116668701,
|
|
"epoch": 0.27977315689981097,
|
|
"grad_norm": 1.0546875,
|
|
"learning_rate": 0.0004995677441110172,
|
|
"loss": 5.8295,
|
|
"mean_token_accuracy": 0.1390916422009468,
|
|
"num_tokens": 6142865.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"entropy": 5.958270597457886,
|
|
"epoch": 0.28019323671497587,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004995658867444192,
|
|
"loss": 5.9185,
|
|
"mean_token_accuracy": 0.13403844311833382,
|
|
"num_tokens": 6152492.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"entropy": 5.9247795104980465,
|
|
"epoch": 0.2806133165301407,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995640253997523,
|
|
"loss": 5.9182,
|
|
"mean_token_accuracy": 0.13453099131584167,
|
|
"num_tokens": 6161953.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"entropy": 5.869791507720947,
|
|
"epoch": 0.2810333963453056,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995621600770492,
|
|
"loss": 5.7688,
|
|
"mean_token_accuracy": 0.14543831422924997,
|
|
"num_tokens": 6171467.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"entropy": 5.874020433425903,
|
|
"epoch": 0.2814534761604705,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004995602907763431,
|
|
"loss": 5.8552,
|
|
"mean_token_accuracy": 0.1376187428832054,
|
|
"num_tokens": 6180646.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"entropy": 5.929383373260498,
|
|
"epoch": 0.28187355597563535,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995584174976672,
|
|
"loss": 5.8713,
|
|
"mean_token_accuracy": 0.1346499465405941,
|
|
"num_tokens": 6189832.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"entropy": 5.927767086029053,
|
|
"epoch": 0.28229363579080025,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004995565402410544,
|
|
"loss": 5.7654,
|
|
"mean_token_accuracy": 0.14761848300695418,
|
|
"num_tokens": 6198339.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"entropy": 5.892761945724487,
|
|
"epoch": 0.28271371560596514,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004995546590065383,
|
|
"loss": 5.8571,
|
|
"mean_token_accuracy": 0.1422846756875515,
|
|
"num_tokens": 6207564.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"entropy": 5.9255454540252686,
|
|
"epoch": 0.28313379542113004,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004995527737941518,
|
|
"loss": 5.9347,
|
|
"mean_token_accuracy": 0.13853738307952881,
|
|
"num_tokens": 6216056.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"entropy": 5.896576976776123,
|
|
"epoch": 0.2835538752362949,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004995508846039287,
|
|
"loss": 5.8965,
|
|
"mean_token_accuracy": 0.13626314997673034,
|
|
"num_tokens": 6225573.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"entropy": 6.006501722335815,
|
|
"epoch": 0.2839739550514598,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004995489914359023,
|
|
"loss": 6.0096,
|
|
"mean_token_accuracy": 0.13380790427327155,
|
|
"num_tokens": 6235057.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"entropy": 6.008948469161988,
|
|
"epoch": 0.2843940348666247,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004995470942901061,
|
|
"loss": 5.9217,
|
|
"mean_token_accuracy": 0.13976462185382843,
|
|
"num_tokens": 6244164.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"entropy": 6.019155550003052,
|
|
"epoch": 0.2848141146817895,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004995451931665738,
|
|
"loss": 5.9344,
|
|
"mean_token_accuracy": 0.13991687223315238,
|
|
"num_tokens": 6253095.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"entropy": 5.957593202590942,
|
|
"epoch": 0.2852341944969544,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499543288065339,
|
|
"loss": 5.8639,
|
|
"mean_token_accuracy": 0.13952580690383912,
|
|
"num_tokens": 6261134.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"entropy": 5.833978176116943,
|
|
"epoch": 0.2856542743121193,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004995413789864354,
|
|
"loss": 5.879,
|
|
"mean_token_accuracy": 0.14211501479148864,
|
|
"num_tokens": 6270384.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"entropy": 5.952889156341553,
|
|
"epoch": 0.28607435412728416,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004995394659298971,
|
|
"loss": 5.8153,
|
|
"mean_token_accuracy": 0.14977512061595916,
|
|
"num_tokens": 6279702.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"entropy": 5.889900207519531,
|
|
"epoch": 0.28649443394244906,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004995375488957576,
|
|
"loss": 5.8612,
|
|
"mean_token_accuracy": 0.13872572034597397,
|
|
"num_tokens": 6288297.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"entropy": 5.9331536293029785,
|
|
"epoch": 0.28691451375761395,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.000499535627884051,
|
|
"loss": 5.9516,
|
|
"mean_token_accuracy": 0.13464640453457832,
|
|
"num_tokens": 6297288.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"entropy": 6.120673799514771,
|
|
"epoch": 0.28733459357277885,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004995337028948115,
|
|
"loss": 5.974,
|
|
"mean_token_accuracy": 0.13250542506575586,
|
|
"num_tokens": 6306719.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"entropy": 5.837868595123291,
|
|
"epoch": 0.2877546733879437,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004995317739280731,
|
|
"loss": 5.8018,
|
|
"mean_token_accuracy": 0.14693671017885207,
|
|
"num_tokens": 6316639.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"entropy": 5.928999614715576,
|
|
"epoch": 0.2881747532031086,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004995298409838699,
|
|
"loss": 5.9251,
|
|
"mean_token_accuracy": 0.1393180750310421,
|
|
"num_tokens": 6326879.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"entropy": 5.877130842208862,
|
|
"epoch": 0.2885948330182735,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.000499527904062236,
|
|
"loss": 5.8293,
|
|
"mean_token_accuracy": 0.14481945633888244,
|
|
"num_tokens": 6335729.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"entropy": 5.915560340881347,
|
|
"epoch": 0.28901491283343833,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004995259631632061,
|
|
"loss": 5.8973,
|
|
"mean_token_accuracy": 0.13230996280908586,
|
|
"num_tokens": 6345154.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"entropy": 5.962369394302368,
|
|
"epoch": 0.28943499264860323,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004995240182868143,
|
|
"loss": 5.8479,
|
|
"mean_token_accuracy": 0.14117665588855743,
|
|
"num_tokens": 6354309.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"entropy": 5.844361209869385,
|
|
"epoch": 0.2898550724637681,
|
|
"grad_norm": 1.0625,
|
|
"learning_rate": 0.0004995220694330951,
|
|
"loss": 5.8255,
|
|
"mean_token_accuracy": 0.14228973686695098,
|
|
"num_tokens": 6363389.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"entropy": 5.88813967704773,
|
|
"epoch": 0.290275152278933,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004995201166020832,
|
|
"loss": 5.8844,
|
|
"mean_token_accuracy": 0.13614363521337508,
|
|
"num_tokens": 6372475.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"entropy": 5.972552013397217,
|
|
"epoch": 0.29069523209409787,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000499518159793813,
|
|
"loss": 5.8329,
|
|
"mean_token_accuracy": 0.14460824504494668,
|
|
"num_tokens": 6380906.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"entropy": 5.866478300094604,
|
|
"epoch": 0.29111531190926276,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000499516199008319,
|
|
"loss": 5.8563,
|
|
"mean_token_accuracy": 0.14013876989483834,
|
|
"num_tokens": 6390085.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"entropy": 5.941966962814331,
|
|
"epoch": 0.29153539172442766,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004995142342456364,
|
|
"loss": 5.9125,
|
|
"mean_token_accuracy": 0.13554606810212136,
|
|
"num_tokens": 6399441.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"entropy": 6.016114854812622,
|
|
"epoch": 0.2919554715395925,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004995122655057997,
|
|
"loss": 5.9876,
|
|
"mean_token_accuracy": 0.13846278935670853,
|
|
"num_tokens": 6408995.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"entropy": 5.820221567153931,
|
|
"epoch": 0.2923755513547574,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004995102927888437,
|
|
"loss": 5.7443,
|
|
"mean_token_accuracy": 0.1459552101790905,
|
|
"num_tokens": 6418080.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"entropy": 5.911068105697632,
|
|
"epoch": 0.2927956311699223,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004995083160948036,
|
|
"loss": 5.9075,
|
|
"mean_token_accuracy": 0.13615152686834336,
|
|
"num_tokens": 6426732.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"entropy": 5.973341417312622,
|
|
"epoch": 0.29321571098508714,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004995063354237141,
|
|
"loss": 5.9199,
|
|
"mean_token_accuracy": 0.14315774142742158,
|
|
"num_tokens": 6435957.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"entropy": 5.914236402511596,
|
|
"epoch": 0.29363579080025204,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004995043507756107,
|
|
"loss": 5.864,
|
|
"mean_token_accuracy": 0.13712269440293312,
|
|
"num_tokens": 6445642.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"entropy": 5.931887197494507,
|
|
"epoch": 0.29405587061541694,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004995023621505282,
|
|
"loss": 5.906,
|
|
"mean_token_accuracy": 0.1387566529214382,
|
|
"num_tokens": 6454664.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"entropy": 5.8483740329742435,
|
|
"epoch": 0.29447595043058183,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000499500369548502,
|
|
"loss": 5.8204,
|
|
"mean_token_accuracy": 0.13878127932548523,
|
|
"num_tokens": 6463224.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"entropy": 6.101959562301635,
|
|
"epoch": 0.2948960302457467,
|
|
"grad_norm": 1.1015625,
|
|
"learning_rate": 0.0004994983729695674,
|
|
"loss": 6.0552,
|
|
"mean_token_accuracy": 0.13270595893263817,
|
|
"num_tokens": 6473112.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"entropy": 5.9638348579406735,
|
|
"epoch": 0.2953161100609116,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004994963724137595,
|
|
"loss": 5.8923,
|
|
"mean_token_accuracy": 0.14195917025208474,
|
|
"num_tokens": 6482062.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"entropy": 5.8792516708374025,
|
|
"epoch": 0.29573618987607647,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004994943678811142,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.13623269721865655,
|
|
"num_tokens": 6490568.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"entropy": 5.920031452178955,
|
|
"epoch": 0.2961562696912413,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004994923593716667,
|
|
"loss": 5.9294,
|
|
"mean_token_accuracy": 0.1400933049619198,
|
|
"num_tokens": 6500815.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"entropy": 5.929981470108032,
|
|
"epoch": 0.2965763495064062,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994903468854527,
|
|
"loss": 5.8119,
|
|
"mean_token_accuracy": 0.1481688842177391,
|
|
"num_tokens": 6509529.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"entropy": 5.864474868774414,
|
|
"epoch": 0.2969964293215711,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004994883304225077,
|
|
"loss": 5.8729,
|
|
"mean_token_accuracy": 0.13643766716122627,
|
|
"num_tokens": 6517934.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"entropy": 5.941485595703125,
|
|
"epoch": 0.297416509136736,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004994863099828675,
|
|
"loss": 5.8357,
|
|
"mean_token_accuracy": 0.13737112134695054,
|
|
"num_tokens": 6526098.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"entropy": 5.857544040679931,
|
|
"epoch": 0.29783658895190085,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.000499484285566568,
|
|
"loss": 5.8676,
|
|
"mean_token_accuracy": 0.13825918808579446,
|
|
"num_tokens": 6535831.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"entropy": 5.862282800674438,
|
|
"epoch": 0.29825666876706575,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004994822571736449,
|
|
"loss": 5.7848,
|
|
"mean_token_accuracy": 0.138202403485775,
|
|
"num_tokens": 6545704.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"entropy": 5.923550367355347,
|
|
"epoch": 0.29867674858223064,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994802248041342,
|
|
"loss": 5.8178,
|
|
"mean_token_accuracy": 0.14043487086892129,
|
|
"num_tokens": 6554423.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"entropy": 5.915227842330933,
|
|
"epoch": 0.2990968283973955,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000499478188458072,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.14151394963264466,
|
|
"num_tokens": 6563989.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"entropy": 5.978702878952026,
|
|
"epoch": 0.2995169082125604,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004994761481354943,
|
|
"loss": 6.0168,
|
|
"mean_token_accuracy": 0.1372433789074421,
|
|
"num_tokens": 6572745.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"entropy": 6.085881900787354,
|
|
"epoch": 0.2999369880277253,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994741038364371,
|
|
"loss": 6.0023,
|
|
"mean_token_accuracy": 0.1347330242395401,
|
|
"num_tokens": 6581723.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"entropy": 5.837055253982544,
|
|
"epoch": 0.3003570678428901,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004994720555609369,
|
|
"loss": 5.7255,
|
|
"mean_token_accuracy": 0.14691844433546067,
|
|
"num_tokens": 6590342.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"entropy": 5.793679332733154,
|
|
"epoch": 0.300777147658055,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004994700033090297,
|
|
"loss": 5.7828,
|
|
"mean_token_accuracy": 0.14932364374399185,
|
|
"num_tokens": 6599206.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"entropy": 6.014719247817993,
|
|
"epoch": 0.3011972274732199,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000499467947080752,
|
|
"loss": 6.1117,
|
|
"mean_token_accuracy": 0.1273707590997219,
|
|
"num_tokens": 6608947.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"entropy": 5.987322616577148,
|
|
"epoch": 0.3016173072883848,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004994658868761402,
|
|
"loss": 5.8883,
|
|
"mean_token_accuracy": 0.14657592847943307,
|
|
"num_tokens": 6618378.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"entropy": 5.946252870559692,
|
|
"epoch": 0.30203738710354966,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004994638226952307,
|
|
"loss": 5.9383,
|
|
"mean_token_accuracy": 0.1343943029642105,
|
|
"num_tokens": 6627527.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"entropy": 5.973345470428467,
|
|
"epoch": 0.30245746691871456,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994617545380604,
|
|
"loss": 5.8799,
|
|
"mean_token_accuracy": 0.1394343391060829,
|
|
"num_tokens": 6636964.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"entropy": 5.846103811264038,
|
|
"epoch": 0.30287754673387945,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004994596824046656,
|
|
"loss": 5.8373,
|
|
"mean_token_accuracy": 0.14053009524941446,
|
|
"num_tokens": 6646074.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"entropy": 5.920796251296997,
|
|
"epoch": 0.3032976265490443,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.000499457606295083,
|
|
"loss": 5.897,
|
|
"mean_token_accuracy": 0.14094351455569268,
|
|
"num_tokens": 6655027.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"entropy": 5.804931497573852,
|
|
"epoch": 0.3037177063642092,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004994555262093495,
|
|
"loss": 5.6923,
|
|
"mean_token_accuracy": 0.1544790118932724,
|
|
"num_tokens": 6663747.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"entropy": 6.0041478157043455,
|
|
"epoch": 0.3041377861793741,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000499453442147502,
|
|
"loss": 6.0113,
|
|
"mean_token_accuracy": 0.13213684484362603,
|
|
"num_tokens": 6672922.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"entropy": 5.917838859558105,
|
|
"epoch": 0.304557865994539,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004994513541095773,
|
|
"loss": 5.8314,
|
|
"mean_token_accuracy": 0.14857635647058487,
|
|
"num_tokens": 6682233.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"entropy": 5.912606573104858,
|
|
"epoch": 0.30497794580970383,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004994492620956126,
|
|
"loss": 5.8901,
|
|
"mean_token_accuracy": 0.1396655946969986,
|
|
"num_tokens": 6691593.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"entropy": 5.919918155670166,
|
|
"epoch": 0.30539802562486873,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004994471661056445,
|
|
"loss": 5.8861,
|
|
"mean_token_accuracy": 0.14072583466768265,
|
|
"num_tokens": 6701318.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"entropy": 6.009689807891846,
|
|
"epoch": 0.3058181054400336,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004994450661397106,
|
|
"loss": 5.892,
|
|
"mean_token_accuracy": 0.1426208183169365,
|
|
"num_tokens": 6710059.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"entropy": 6.047525215148926,
|
|
"epoch": 0.30623818525519847,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.000499442962197848,
|
|
"loss": 5.975,
|
|
"mean_token_accuracy": 0.13458139076828957,
|
|
"num_tokens": 6719811.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"entropy": 5.868422555923462,
|
|
"epoch": 0.30665826507036337,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004994408542800937,
|
|
"loss": 5.8541,
|
|
"mean_token_accuracy": 0.14217756688594818,
|
|
"num_tokens": 6728789.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"entropy": 5.868635082244873,
|
|
"epoch": 0.30707834488552826,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004994387423864855,
|
|
"loss": 5.8459,
|
|
"mean_token_accuracy": 0.14152047485113145,
|
|
"num_tokens": 6737706.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"entropy": 5.855863761901856,
|
|
"epoch": 0.3074984247006931,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004994366265170603,
|
|
"loss": 5.7885,
|
|
"mean_token_accuracy": 0.15381008386611938,
|
|
"num_tokens": 6746861.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"entropy": 5.958421134948731,
|
|
"epoch": 0.307918504515858,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004994345066718558,
|
|
"loss": 5.9914,
|
|
"mean_token_accuracy": 0.13055020123720168,
|
|
"num_tokens": 6755242.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"entropy": 6.003214979171753,
|
|
"epoch": 0.3083385843310229,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004994323828509098,
|
|
"loss": 5.9324,
|
|
"mean_token_accuracy": 0.13040165677666665,
|
|
"num_tokens": 6764549.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"entropy": 5.916761636734009,
|
|
"epoch": 0.3087586641461878,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004994302550542596,
|
|
"loss": 5.9094,
|
|
"mean_token_accuracy": 0.145879103243351,
|
|
"num_tokens": 6774123.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"entropy": 5.818877267837524,
|
|
"epoch": 0.30917874396135264,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000499428123281943,
|
|
"loss": 5.6883,
|
|
"mean_token_accuracy": 0.14885310381650924,
|
|
"num_tokens": 6782922.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"entropy": 5.8812672138214115,
|
|
"epoch": 0.30959882377651754,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004994259875339978,
|
|
"loss": 5.9311,
|
|
"mean_token_accuracy": 0.13911449760198594,
|
|
"num_tokens": 6792042.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"entropy": 6.0224377632141115,
|
|
"epoch": 0.31001890359168244,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004994238478104617,
|
|
"loss": 5.9298,
|
|
"mean_token_accuracy": 0.13841390311717988,
|
|
"num_tokens": 6800994.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"entropy": 5.922906446456909,
|
|
"epoch": 0.3104389834068473,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004994217041113727,
|
|
"loss": 5.8716,
|
|
"mean_token_accuracy": 0.14270309880375862,
|
|
"num_tokens": 6809938.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"entropy": 5.951845121383667,
|
|
"epoch": 0.3108590632220122,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004994195564367688,
|
|
"loss": 5.9849,
|
|
"mean_token_accuracy": 0.1360231176018715,
|
|
"num_tokens": 6820289.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"entropy": 5.994351577758789,
|
|
"epoch": 0.3112791430371771,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004994174047866882,
|
|
"loss": 5.8235,
|
|
"mean_token_accuracy": 0.14149386510252954,
|
|
"num_tokens": 6830068.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"entropy": 5.771749830245971,
|
|
"epoch": 0.3116992228523419,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004994152491611686,
|
|
"loss": 5.8521,
|
|
"mean_token_accuracy": 0.1437979094684124,
|
|
"num_tokens": 6838591.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"entropy": 5.865754890441894,
|
|
"epoch": 0.3121193026675068,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004994130895602485,
|
|
"loss": 5.8204,
|
|
"mean_token_accuracy": 0.13915161341428756,
|
|
"num_tokens": 6847796.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"entropy": 6.016102695465088,
|
|
"epoch": 0.3125393824826717,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000499410925983966,
|
|
"loss": 5.9097,
|
|
"mean_token_accuracy": 0.14341016113758087,
|
|
"num_tokens": 6856585.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"entropy": 5.82035460472107,
|
|
"epoch": 0.3129594622978366,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004994087584323596,
|
|
"loss": 5.8224,
|
|
"mean_token_accuracy": 0.1492151916027069,
|
|
"num_tokens": 6865757.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"entropy": 5.874684762954712,
|
|
"epoch": 0.31337954211300145,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004994065869054676,
|
|
"loss": 5.8703,
|
|
"mean_token_accuracy": 0.13963879272341728,
|
|
"num_tokens": 6875371.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"entropy": 5.973430156707764,
|
|
"epoch": 0.31379962192816635,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004994044114033283,
|
|
"loss": 5.9223,
|
|
"mean_token_accuracy": 0.1317138932645321,
|
|
"num_tokens": 6884050.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"entropy": 5.9677238941192625,
|
|
"epoch": 0.31421970174333125,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004994022319259806,
|
|
"loss": 5.8772,
|
|
"mean_token_accuracy": 0.14338692352175714,
|
|
"num_tokens": 6893079.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"entropy": 5.878354215621949,
|
|
"epoch": 0.3146397815584961,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004994000484734629,
|
|
"loss": 5.9909,
|
|
"mean_token_accuracy": 0.14075467139482498,
|
|
"num_tokens": 6903100.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"entropy": 5.928855514526367,
|
|
"epoch": 0.315059861373661,
|
|
"grad_norm": 1.078125,
|
|
"learning_rate": 0.0004993978610458137,
|
|
"loss": 5.837,
|
|
"mean_token_accuracy": 0.14158818423748015,
|
|
"num_tokens": 6912164.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"entropy": 5.849039506912232,
|
|
"epoch": 0.3154799411888259,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004993956696430721,
|
|
"loss": 5.8489,
|
|
"mean_token_accuracy": 0.13852613866329194,
|
|
"num_tokens": 6921183.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"entropy": 5.947899580001831,
|
|
"epoch": 0.3159000210039908,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993934742652768,
|
|
"loss": 5.918,
|
|
"mean_token_accuracy": 0.13974586948752404,
|
|
"num_tokens": 6931325.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"entropy": 5.930049276351928,
|
|
"epoch": 0.3163201008191556,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004993912749124665,
|
|
"loss": 5.8111,
|
|
"mean_token_accuracy": 0.14738105684518815,
|
|
"num_tokens": 6940234.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"entropy": 5.850810146331787,
|
|
"epoch": 0.3167401806343205,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004993890715846804,
|
|
"loss": 5.9136,
|
|
"mean_token_accuracy": 0.1437673717737198,
|
|
"num_tokens": 6949067.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"entropy": 5.959416913986206,
|
|
"epoch": 0.3171602604494854,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993868642819574,
|
|
"loss": 5.8851,
|
|
"mean_token_accuracy": 0.13802511468529702,
|
|
"num_tokens": 6959085.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"entropy": 5.905188941955567,
|
|
"epoch": 0.31758034026465026,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004993846530043367,
|
|
"loss": 5.9143,
|
|
"mean_token_accuracy": 0.1347965881228447,
|
|
"num_tokens": 6967392.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"entropy": 5.903332805633545,
|
|
"epoch": 0.31800042007981516,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993824377518574,
|
|
"loss": 5.8461,
|
|
"mean_token_accuracy": 0.1488291099667549,
|
|
"num_tokens": 6976369.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"entropy": 5.983600234985351,
|
|
"epoch": 0.31842049989498006,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004993802185245587,
|
|
"loss": 5.8623,
|
|
"mean_token_accuracy": 0.14806569889187812,
|
|
"num_tokens": 6985889.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"entropy": 5.85070834159851,
|
|
"epoch": 0.3188405797101449,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00049937799532248,
|
|
"loss": 5.8915,
|
|
"mean_token_accuracy": 0.1322341412305832,
|
|
"num_tokens": 6995396.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"entropy": 6.083252477645874,
|
|
"epoch": 0.3192606595253098,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993757681456607,
|
|
"loss": 5.94,
|
|
"mean_token_accuracy": 0.13651170060038567,
|
|
"num_tokens": 7004666.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"entropy": 5.98352370262146,
|
|
"epoch": 0.3196807393404747,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993735369941401,
|
|
"loss": 5.9741,
|
|
"mean_token_accuracy": 0.13378495275974273,
|
|
"num_tokens": 7014608.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"entropy": 5.872314596176148,
|
|
"epoch": 0.3201008191556396,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004993713018679579,
|
|
"loss": 5.8367,
|
|
"mean_token_accuracy": 0.13734248280525208,
|
|
"num_tokens": 7023671.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"entropy": 5.887682008743286,
|
|
"epoch": 0.32052089897080444,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004993690627671536,
|
|
"loss": 5.8912,
|
|
"mean_token_accuracy": 0.13631067648530007,
|
|
"num_tokens": 7033786.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"entropy": 5.925025987625122,
|
|
"epoch": 0.32094097878596933,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004993668196917669,
|
|
"loss": 5.8026,
|
|
"mean_token_accuracy": 0.14422772228717803,
|
|
"num_tokens": 7042162.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"entropy": 5.993115663528442,
|
|
"epoch": 0.32136105860113423,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993645726418375,
|
|
"loss": 5.946,
|
|
"mean_token_accuracy": 0.13835487216711045,
|
|
"num_tokens": 7051903.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"entropy": 5.828875875473022,
|
|
"epoch": 0.3217811384162991,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993623216174053,
|
|
"loss": 5.7755,
|
|
"mean_token_accuracy": 0.15191829651594163,
|
|
"num_tokens": 7060229.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"entropy": 5.8507789134979244,
|
|
"epoch": 0.32220121823146397,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.00049936006661851,
|
|
"loss": 5.8708,
|
|
"mean_token_accuracy": 0.13942239433526993,
|
|
"num_tokens": 7069040.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"entropy": 5.9059672355651855,
|
|
"epoch": 0.32262129804662887,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004993578076451917,
|
|
"loss": 5.755,
|
|
"mean_token_accuracy": 0.1418701082468033,
|
|
"num_tokens": 7078409.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"entropy": 5.73867621421814,
|
|
"epoch": 0.32304137786179377,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993555446974903,
|
|
"loss": 5.841,
|
|
"mean_token_accuracy": 0.13865280598402024,
|
|
"num_tokens": 7087983.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"entropy": 5.833849382400513,
|
|
"epoch": 0.3234614576769586,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000499353277775446,
|
|
"loss": 5.7975,
|
|
"mean_token_accuracy": 0.14864109456539154,
|
|
"num_tokens": 7097277.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"entropy": 5.906425857543946,
|
|
"epoch": 0.3238815374921235,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004993510068790989,
|
|
"loss": 5.6902,
|
|
"mean_token_accuracy": 0.15845565646886825,
|
|
"num_tokens": 7105918.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"entropy": 5.729353284835815,
|
|
"epoch": 0.3243016173072884,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004993487320084892,
|
|
"loss": 5.7663,
|
|
"mean_token_accuracy": 0.15123388469219207,
|
|
"num_tokens": 7115049.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"entropy": 5.881364727020264,
|
|
"epoch": 0.32472169712245325,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004993464531636573,
|
|
"loss": 5.848,
|
|
"mean_token_accuracy": 0.1397578552365303,
|
|
"num_tokens": 7124862.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"entropy": 5.853383731842041,
|
|
"epoch": 0.32514177693761814,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993441703446435,
|
|
"loss": 5.7449,
|
|
"mean_token_accuracy": 0.14876592308282852,
|
|
"num_tokens": 7133280.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"entropy": 5.926945161819458,
|
|
"epoch": 0.32556185675278304,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004993418835514882,
|
|
"loss": 5.9468,
|
|
"mean_token_accuracy": 0.13614385947585106,
|
|
"num_tokens": 7142446.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"entropy": 5.948226881027222,
|
|
"epoch": 0.3259819365679479,
|
|
"grad_norm": 1.046875,
|
|
"learning_rate": 0.0004993395927842321,
|
|
"loss": 5.8437,
|
|
"mean_token_accuracy": 0.1408906787633896,
|
|
"num_tokens": 7152143.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"entropy": 5.944639015197754,
|
|
"epoch": 0.3264020163831128,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004993372980429155,
|
|
"loss": 5.9314,
|
|
"mean_token_accuracy": 0.13912539780139924,
|
|
"num_tokens": 7162046.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"entropy": 5.925892972946167,
|
|
"epoch": 0.3268220961982777,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004993349993275792,
|
|
"loss": 5.8046,
|
|
"mean_token_accuracy": 0.141475647687912,
|
|
"num_tokens": 7171557.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"entropy": 5.751782655715942,
|
|
"epoch": 0.3272421760134426,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004993326966382639,
|
|
"loss": 5.7152,
|
|
"mean_token_accuracy": 0.14672790616750717,
|
|
"num_tokens": 7180927.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"entropy": 5.833934020996094,
|
|
"epoch": 0.3276622558286074,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004993303899750104,
|
|
"loss": 5.784,
|
|
"mean_token_accuracy": 0.14524296969175338,
|
|
"num_tokens": 7189552.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"entropy": 5.929954242706299,
|
|
"epoch": 0.3280823356437723,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004993280793378595,
|
|
"loss": 5.8197,
|
|
"mean_token_accuracy": 0.14072833955287933,
|
|
"num_tokens": 7197857.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"entropy": 5.886449480056763,
|
|
"epoch": 0.3285024154589372,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004993257647268522,
|
|
"loss": 5.7906,
|
|
"mean_token_accuracy": 0.15043871477246284,
|
|
"num_tokens": 7206785.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"entropy": 5.909309577941895,
|
|
"epoch": 0.32892249527410206,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004993234461420295,
|
|
"loss": 5.8845,
|
|
"mean_token_accuracy": 0.13852151483297348,
|
|
"num_tokens": 7216360.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"entropy": 5.842478799819946,
|
|
"epoch": 0.32934257508926695,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004993211235834326,
|
|
"loss": 5.6839,
|
|
"mean_token_accuracy": 0.16384934931993483,
|
|
"num_tokens": 7224890.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"entropy": 5.758263874053955,
|
|
"epoch": 0.32976265490443185,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004993187970511023,
|
|
"loss": 5.735,
|
|
"mean_token_accuracy": 0.16684153228998183,
|
|
"num_tokens": 7234442.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"entropy": 5.873496627807617,
|
|
"epoch": 0.33018273471959675,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004993164665450801,
|
|
"loss": 5.8937,
|
|
"mean_token_accuracy": 0.14423306286334991,
|
|
"num_tokens": 7244023.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"entropy": 5.877293682098388,
|
|
"epoch": 0.3306028145347616,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004993141320654072,
|
|
"loss": 5.7546,
|
|
"mean_token_accuracy": 0.14760554879903792,
|
|
"num_tokens": 7253548.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"entropy": 5.862598562240601,
|
|
"epoch": 0.3310228943499265,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.000499311793612125,
|
|
"loss": 5.8187,
|
|
"mean_token_accuracy": 0.14365085512399672,
|
|
"num_tokens": 7262962.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"entropy": 5.9090534210205075,
|
|
"epoch": 0.3314429741650914,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993094511852748,
|
|
"loss": 5.8412,
|
|
"mean_token_accuracy": 0.1450173959136009,
|
|
"num_tokens": 7272234.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"entropy": 5.899152231216431,
|
|
"epoch": 0.33186305398025623,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004993071047848983,
|
|
"loss": 5.8162,
|
|
"mean_token_accuracy": 0.1440458543598652,
|
|
"num_tokens": 7281524.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"entropy": 5.8174926280975345,
|
|
"epoch": 0.3322831337954211,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004993047544110368,
|
|
"loss": 5.7104,
|
|
"mean_token_accuracy": 0.1493962250649929,
|
|
"num_tokens": 7289601.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"entropy": 5.727307987213135,
|
|
"epoch": 0.332703213610586,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004993024000637321,
|
|
"loss": 5.6805,
|
|
"mean_token_accuracy": 0.15279524326324462,
|
|
"num_tokens": 7298508.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"entropy": 5.863152980804443,
|
|
"epoch": 0.33312329342575087,
|
|
"grad_norm": 1.1484375,
|
|
"learning_rate": 0.0004993000417430259,
|
|
"loss": 5.899,
|
|
"mean_token_accuracy": 0.14116615280508996,
|
|
"num_tokens": 7309065.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"entropy": 5.987276601791382,
|
|
"epoch": 0.33354337324091576,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.00049929767944896,
|
|
"loss": 5.9279,
|
|
"mean_token_accuracy": 0.14180676117539406,
|
|
"num_tokens": 7319669.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"entropy": 5.944264316558838,
|
|
"epoch": 0.33396345305608066,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004992953131815761,
|
|
"loss": 5.8675,
|
|
"mean_token_accuracy": 0.14308214634656907,
|
|
"num_tokens": 7328425.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"entropy": 5.837700128555298,
|
|
"epoch": 0.33438353287124556,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004992929429409164,
|
|
"loss": 5.7527,
|
|
"mean_token_accuracy": 0.1491951271891594,
|
|
"num_tokens": 7337369.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"entropy": 5.8214014053344725,
|
|
"epoch": 0.3348036126864104,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004992905687270225,
|
|
"loss": 5.8183,
|
|
"mean_token_accuracy": 0.1459653303027153,
|
|
"num_tokens": 7346829.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"entropy": 5.947172021865844,
|
|
"epoch": 0.3352236925015753,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004992881905399368,
|
|
"loss": 5.8631,
|
|
"mean_token_accuracy": 0.14260231107473373,
|
|
"num_tokens": 7355976.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"entropy": 5.880902576446533,
|
|
"epoch": 0.3356437723167402,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004992858083797013,
|
|
"loss": 5.8357,
|
|
"mean_token_accuracy": 0.13988892063498498,
|
|
"num_tokens": 7365210.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"entropy": 5.887827444076538,
|
|
"epoch": 0.33606385213190504,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992834222463581,
|
|
"loss": 5.8916,
|
|
"mean_token_accuracy": 0.1366453118622303,
|
|
"num_tokens": 7374175.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"entropy": 5.9429937362670895,
|
|
"epoch": 0.33648393194706994,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004992810321399496,
|
|
"loss": 5.9143,
|
|
"mean_token_accuracy": 0.13915260806679725,
|
|
"num_tokens": 7383302.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"entropy": 5.919287061691284,
|
|
"epoch": 0.33690401176223483,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004992786380605182,
|
|
"loss": 5.8924,
|
|
"mean_token_accuracy": 0.1387575164437294,
|
|
"num_tokens": 7392746.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"entropy": 5.856390810012817,
|
|
"epoch": 0.33732409157739973,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004992762400081062,
|
|
"loss": 5.7403,
|
|
"mean_token_accuracy": 0.1431375488638878,
|
|
"num_tokens": 7401604.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"entropy": 5.85037088394165,
|
|
"epoch": 0.3377441713925646,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004992738379827559,
|
|
"loss": 5.8333,
|
|
"mean_token_accuracy": 0.14066973105072975,
|
|
"num_tokens": 7410594.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"entropy": 5.8780660152435305,
|
|
"epoch": 0.33816425120772947,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004992714319845101,
|
|
"loss": 5.7395,
|
|
"mean_token_accuracy": 0.15344746708869933,
|
|
"num_tokens": 7418831.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"entropy": 5.772813749313355,
|
|
"epoch": 0.33858433102289437,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992690220134116,
|
|
"loss": 5.7892,
|
|
"mean_token_accuracy": 0.1462782494723797,
|
|
"num_tokens": 7427731.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"entropy": 5.971381282806396,
|
|
"epoch": 0.3390044108380592,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004992666080695027,
|
|
"loss": 5.9117,
|
|
"mean_token_accuracy": 0.13779002502560617,
|
|
"num_tokens": 7436447.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"entropy": 5.921543741226197,
|
|
"epoch": 0.3394244906532241,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992641901528262,
|
|
"loss": 5.811,
|
|
"mean_token_accuracy": 0.14552520364522933,
|
|
"num_tokens": 7445352.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"entropy": 5.886599731445313,
|
|
"epoch": 0.339844570468389,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004992617682634252,
|
|
"loss": 5.8513,
|
|
"mean_token_accuracy": 0.14520843252539634,
|
|
"num_tokens": 7454298.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"entropy": 5.877497959136963,
|
|
"epoch": 0.34026465028355385,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992593424013424,
|
|
"loss": 5.8719,
|
|
"mean_token_accuracy": 0.14371060207486153,
|
|
"num_tokens": 7463543.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"entropy": 5.891790342330933,
|
|
"epoch": 0.34068473009871875,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004992569125666209,
|
|
"loss": 5.8933,
|
|
"mean_token_accuracy": 0.1396215297281742,
|
|
"num_tokens": 7472701.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"entropy": 6.004440689086914,
|
|
"epoch": 0.34110480991388364,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004992544787593037,
|
|
"loss": 5.8639,
|
|
"mean_token_accuracy": 0.13958193585276604,
|
|
"num_tokens": 7481123.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"entropy": 5.980106163024902,
|
|
"epoch": 0.34152488972904854,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004992520409794338,
|
|
"loss": 5.9379,
|
|
"mean_token_accuracy": 0.1414543256163597,
|
|
"num_tokens": 7490439.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"entropy": 5.900824117660522,
|
|
"epoch": 0.3419449695442134,
|
|
"grad_norm": 1.09375,
|
|
"learning_rate": 0.0004992495992270544,
|
|
"loss": 5.7958,
|
|
"mean_token_accuracy": 0.14631813019514084,
|
|
"num_tokens": 7499326.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"entropy": 5.891779184341431,
|
|
"epoch": 0.3423650493593783,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004992471535022089,
|
|
"loss": 5.8633,
|
|
"mean_token_accuracy": 0.14064768627285956,
|
|
"num_tokens": 7509407.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"entropy": 5.933660221099854,
|
|
"epoch": 0.3427851291745432,
|
|
"grad_norm": 1.0859375,
|
|
"learning_rate": 0.0004992447038049405,
|
|
"loss": 5.9161,
|
|
"mean_token_accuracy": 0.14266983717679976,
|
|
"num_tokens": 7518443.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"entropy": 5.83581509590149,
|
|
"epoch": 0.343205208989708,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004992422501352927,
|
|
"loss": 5.7669,
|
|
"mean_token_accuracy": 0.14985841438174247,
|
|
"num_tokens": 7527609.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"entropy": 5.924132442474365,
|
|
"epoch": 0.3436252888048729,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004992397924933089,
|
|
"loss": 5.8592,
|
|
"mean_token_accuracy": 0.13954362720251084,
|
|
"num_tokens": 7536890.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"entropy": 5.917565202713012,
|
|
"epoch": 0.3440453686200378,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004992373308790325,
|
|
"loss": 5.811,
|
|
"mean_token_accuracy": 0.14937977269291877,
|
|
"num_tokens": 7546509.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"entropy": 5.8026703834533695,
|
|
"epoch": 0.3444654484352027,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004992348652925074,
|
|
"loss": 5.8363,
|
|
"mean_token_accuracy": 0.14076031297445296,
|
|
"num_tokens": 7555336.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"entropy": 5.921528148651123,
|
|
"epoch": 0.34488552825036756,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992323957337771,
|
|
"loss": 5.7996,
|
|
"mean_token_accuracy": 0.14651144444942474,
|
|
"num_tokens": 7565210.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"entropy": 5.882741546630859,
|
|
"epoch": 0.34530560806553245,
|
|
"grad_norm": 1.140625,
|
|
"learning_rate": 0.0004992299222028855,
|
|
"loss": 5.892,
|
|
"mean_token_accuracy": 0.1487639456987381,
|
|
"num_tokens": 7574516.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"entropy": 5.829581546783447,
|
|
"epoch": 0.34572568788069735,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004992274446998761,
|
|
"loss": 5.7261,
|
|
"mean_token_accuracy": 0.15058319717645646,
|
|
"num_tokens": 7583219.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"entropy": 5.973264932632446,
|
|
"epoch": 0.3461457676958622,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004992249632247929,
|
|
"loss": 5.9592,
|
|
"mean_token_accuracy": 0.13624709472060204,
|
|
"num_tokens": 7592050.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"entropy": 5.926892328262329,
|
|
"epoch": 0.3465658475110271,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004992224777776802,
|
|
"loss": 5.7982,
|
|
"mean_token_accuracy": 0.14514639526605605,
|
|
"num_tokens": 7600718.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"entropy": 5.865804290771484,
|
|
"epoch": 0.346985927326192,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992199883585816,
|
|
"loss": 5.8233,
|
|
"mean_token_accuracy": 0.14894086718559266,
|
|
"num_tokens": 7609191.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"entropy": 5.919741106033325,
|
|
"epoch": 0.34740600714135683,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004992174949675413,
|
|
"loss": 5.8602,
|
|
"mean_token_accuracy": 0.14337126538157463,
|
|
"num_tokens": 7618509.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"entropy": 5.859026050567627,
|
|
"epoch": 0.34782608695652173,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004992149976046037,
|
|
"loss": 5.7922,
|
|
"mean_token_accuracy": 0.1422334760427475,
|
|
"num_tokens": 7627851.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"entropy": 5.82733941078186,
|
|
"epoch": 0.3482461667716866,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004992124962698128,
|
|
"loss": 5.8424,
|
|
"mean_token_accuracy": 0.14138613119721413,
|
|
"num_tokens": 7636748.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"entropy": 5.9045960903167725,
|
|
"epoch": 0.3486662465868515,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000499209990963213,
|
|
"loss": 5.7782,
|
|
"mean_token_accuracy": 0.14405173510313035,
|
|
"num_tokens": 7645436.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"entropy": 5.942763233184815,
|
|
"epoch": 0.34908632640201637,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004992074816848487,
|
|
"loss": 5.912,
|
|
"mean_token_accuracy": 0.1420220673084259,
|
|
"num_tokens": 7655414.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"entropy": 5.806369590759277,
|
|
"epoch": 0.34950640621718126,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004992049684347642,
|
|
"loss": 5.6828,
|
|
"mean_token_accuracy": 0.1479445531964302,
|
|
"num_tokens": 7664295.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"entropy": 5.8392486572265625,
|
|
"epoch": 0.34992648603234616,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004992024512130042,
|
|
"loss": 5.8256,
|
|
"mean_token_accuracy": 0.14557857811450958,
|
|
"num_tokens": 7673295.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"entropy": 5.829631280899048,
|
|
"epoch": 0.350346565847511,
|
|
"grad_norm": 1.1171875,
|
|
"learning_rate": 0.0004991999300196132,
|
|
"loss": 5.8282,
|
|
"mean_token_accuracy": 0.14183629676699638,
|
|
"num_tokens": 7682932.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"entropy": 5.967007064819336,
|
|
"epoch": 0.3507666456626759,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004991974048546359,
|
|
"loss": 5.8389,
|
|
"mean_token_accuracy": 0.1410795919597149,
|
|
"num_tokens": 7692105.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"entropy": 5.839569425582885,
|
|
"epoch": 0.3511867254778408,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000499194875718117,
|
|
"loss": 5.8278,
|
|
"mean_token_accuracy": 0.14473577067255974,
|
|
"num_tokens": 7701294.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"entropy": 5.905254983901978,
|
|
"epoch": 0.3516068052930057,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004991923426101013,
|
|
"loss": 5.8161,
|
|
"mean_token_accuracy": 0.13896411135792733,
|
|
"num_tokens": 7710964.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"entropy": 5.987090635299682,
|
|
"epoch": 0.35202688510817054,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004991898055306337,
|
|
"loss": 5.9588,
|
|
"mean_token_accuracy": 0.13461354821920396,
|
|
"num_tokens": 7719938.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"entropy": 5.921365547180176,
|
|
"epoch": 0.35244696492333544,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004991872644797591,
|
|
"loss": 5.8516,
|
|
"mean_token_accuracy": 0.13835739120841026,
|
|
"num_tokens": 7729129.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"entropy": 5.872485494613647,
|
|
"epoch": 0.35286704473850034,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004991847194575226,
|
|
"loss": 5.8744,
|
|
"mean_token_accuracy": 0.135695618391037,
|
|
"num_tokens": 7738506.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"entropy": 5.9815671920776365,
|
|
"epoch": 0.3532871245536652,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004991821704639693,
|
|
"loss": 5.9756,
|
|
"mean_token_accuracy": 0.1382329933345318,
|
|
"num_tokens": 7749320.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"entropy": 6.020629215240478,
|
|
"epoch": 0.3537072043688301,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004991796174991443,
|
|
"loss": 5.8318,
|
|
"mean_token_accuracy": 0.14441338777542115,
|
|
"num_tokens": 7758735.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"entropy": 5.784712409973144,
|
|
"epoch": 0.354127284183995,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004991770605630927,
|
|
"loss": 5.7838,
|
|
"mean_token_accuracy": 0.14566663056612014,
|
|
"num_tokens": 7767556.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"entropy": 5.859303379058838,
|
|
"epoch": 0.3545473639991598,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004991744996558599,
|
|
"loss": 5.8127,
|
|
"mean_token_accuracy": 0.14865762144327163,
|
|
"num_tokens": 7776615.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"entropy": 5.8661195755004885,
|
|
"epoch": 0.3549674438143247,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004991719347774913,
|
|
"loss": 5.8425,
|
|
"mean_token_accuracy": 0.1501207634806633,
|
|
"num_tokens": 7785288.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"entropy": 5.876074028015137,
|
|
"epoch": 0.3553875236294896,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004991693659280324,
|
|
"loss": 5.768,
|
|
"mean_token_accuracy": 0.14568567126989365,
|
|
"num_tokens": 7794381.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"entropy": 5.80495433807373,
|
|
"epoch": 0.3558076034446545,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004991667931075284,
|
|
"loss": 5.7286,
|
|
"mean_token_accuracy": 0.14424108862876892,
|
|
"num_tokens": 7803265.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"entropy": 5.866269779205322,
|
|
"epoch": 0.35622768325981935,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004991642163160252,
|
|
"loss": 5.8494,
|
|
"mean_token_accuracy": 0.14268068224191666,
|
|
"num_tokens": 7812445.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"entropy": 5.940937805175781,
|
|
"epoch": 0.35664776307498425,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004991616355535684,
|
|
"loss": 5.8433,
|
|
"mean_token_accuracy": 0.14912385791540145,
|
|
"num_tokens": 7822073.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"entropy": 5.912532901763916,
|
|
"epoch": 0.35706784289014915,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004991590508202036,
|
|
"loss": 5.8223,
|
|
"mean_token_accuracy": 0.14509293287992478,
|
|
"num_tokens": 7831193.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"entropy": 5.9145135402679445,
|
|
"epoch": 0.357487922705314,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004991564621159766,
|
|
"loss": 5.8496,
|
|
"mean_token_accuracy": 0.1429229497909546,
|
|
"num_tokens": 7840311.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"entropy": 5.863569116592407,
|
|
"epoch": 0.3579080025204789,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004991538694409334,
|
|
"loss": 5.8807,
|
|
"mean_token_accuracy": 0.13731356635689734,
|
|
"num_tokens": 7849622.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"entropy": 5.866101455688477,
|
|
"epoch": 0.3583280823356438,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004991512727951198,
|
|
"loss": 5.8355,
|
|
"mean_token_accuracy": 0.14157909527420998,
|
|
"num_tokens": 7859494.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"entropy": 6.04155158996582,
|
|
"epoch": 0.3587481621508087,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004991486721785818,
|
|
"loss": 5.9463,
|
|
"mean_token_accuracy": 0.13783199936151505,
|
|
"num_tokens": 7868526.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"entropy": 5.926672124862671,
|
|
"epoch": 0.3591682419659735,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004991460675913655,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.145780511200428,
|
|
"num_tokens": 7877631.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"entropy": 5.883526086807251,
|
|
"epoch": 0.3595883217811384,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.000499143459033517,
|
|
"loss": 5.8215,
|
|
"mean_token_accuracy": 0.14927242919802666,
|
|
"num_tokens": 7886814.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"entropy": 5.745771408081055,
|
|
"epoch": 0.3600084015963033,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004991408465050825,
|
|
"loss": 5.6432,
|
|
"mean_token_accuracy": 0.1510879337787628,
|
|
"num_tokens": 7896337.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"entropy": 5.832517528533936,
|
|
"epoch": 0.36042848141146816,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004991382300061084,
|
|
"loss": 5.9099,
|
|
"mean_token_accuracy": 0.13439226746559144,
|
|
"num_tokens": 7906071.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"entropy": 5.962070560455322,
|
|
"epoch": 0.36084856122663306,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004991356095366409,
|
|
"loss": 5.8959,
|
|
"mean_token_accuracy": 0.14356234297156334,
|
|
"num_tokens": 7915003.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"entropy": 5.901134014129639,
|
|
"epoch": 0.36126864104179796,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004991329850967266,
|
|
"loss": 5.7553,
|
|
"mean_token_accuracy": 0.14801599234342575,
|
|
"num_tokens": 7924408.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"entropy": 5.79054274559021,
|
|
"epoch": 0.3616887208569628,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004991303566864118,
|
|
"loss": 5.7258,
|
|
"mean_token_accuracy": 0.14961198195815087,
|
|
"num_tokens": 7934717.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"entropy": 5.805612707138062,
|
|
"epoch": 0.3621088006721277,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004991277243057431,
|
|
"loss": 5.7802,
|
|
"mean_token_accuracy": 0.14127594009041786,
|
|
"num_tokens": 7944278.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"entropy": 5.852519369125366,
|
|
"epoch": 0.3625288804872926,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004991250879547673,
|
|
"loss": 5.8079,
|
|
"mean_token_accuracy": 0.147132358700037,
|
|
"num_tokens": 7953344.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"entropy": 5.814604616165161,
|
|
"epoch": 0.3629489603024575,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004991224476335309,
|
|
"loss": 5.8228,
|
|
"mean_token_accuracy": 0.14245626628398894,
|
|
"num_tokens": 7962869.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"entropy": 5.943966865539551,
|
|
"epoch": 0.36336904011762233,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004991198033420807,
|
|
"loss": 5.8144,
|
|
"mean_token_accuracy": 0.14347591027617454,
|
|
"num_tokens": 7971981.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"entropy": 5.808311653137207,
|
|
"epoch": 0.36378911993278723,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004991171550804636,
|
|
"loss": 5.7932,
|
|
"mean_token_accuracy": 0.14206481873989105,
|
|
"num_tokens": 7980979.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"entropy": 5.905320501327514,
|
|
"epoch": 0.36420919974795213,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004991145028487266,
|
|
"loss": 5.8545,
|
|
"mean_token_accuracy": 0.14277459532022477,
|
|
"num_tokens": 7989607.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"entropy": 5.837810134887695,
|
|
"epoch": 0.36462927956311697,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004991118466469165,
|
|
"loss": 5.6845,
|
|
"mean_token_accuracy": 0.1493909515440464,
|
|
"num_tokens": 7998356.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"entropy": 5.851336050033569,
|
|
"epoch": 0.36504935937828187,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004991091864750805,
|
|
"loss": 5.7847,
|
|
"mean_token_accuracy": 0.15024245530366898,
|
|
"num_tokens": 8007596.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"entropy": 5.901401472091675,
|
|
"epoch": 0.36546943919344677,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004991065223332655,
|
|
"loss": 5.8505,
|
|
"mean_token_accuracy": 0.14087127447128295,
|
|
"num_tokens": 8016493.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"entropy": 5.870211553573609,
|
|
"epoch": 0.36588951900861166,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004991038542215191,
|
|
"loss": 5.8227,
|
|
"mean_token_accuracy": 0.13674163743853568,
|
|
"num_tokens": 8025867.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"entropy": 5.875178384780884,
|
|
"epoch": 0.3663095988237765,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004991011821398882,
|
|
"loss": 5.8524,
|
|
"mean_token_accuracy": 0.14893437176942825,
|
|
"num_tokens": 8036251.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"entropy": 5.93429970741272,
|
|
"epoch": 0.3667296786389414,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004990985060884202,
|
|
"loss": 5.7934,
|
|
"mean_token_accuracy": 0.14601994007825853,
|
|
"num_tokens": 8045647.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"entropy": 5.862146234512329,
|
|
"epoch": 0.3671497584541063,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004990958260671627,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.13680800497531892,
|
|
"num_tokens": 8056025.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"entropy": 5.831518173217773,
|
|
"epoch": 0.36756983826927114,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004990931420761629,
|
|
"loss": 5.801,
|
|
"mean_token_accuracy": 0.14898931458592415,
|
|
"num_tokens": 8065029.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"entropy": 5.986507749557495,
|
|
"epoch": 0.36798991808443604,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004990904541154685,
|
|
"loss": 5.7557,
|
|
"mean_token_accuracy": 0.1577958881855011,
|
|
"num_tokens": 8073249.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"entropy": 5.926363086700439,
|
|
"epoch": 0.36840999789960094,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004990877621851271,
|
|
"loss": 5.8958,
|
|
"mean_token_accuracy": 0.14099107980728148,
|
|
"num_tokens": 8082039.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"entropy": 5.77527756690979,
|
|
"epoch": 0.3688300777147658,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004990850662851863,
|
|
"loss": 5.7314,
|
|
"mean_token_accuracy": 0.14939506947994233,
|
|
"num_tokens": 8090011.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"entropy": 5.903356552124023,
|
|
"epoch": 0.3692501575299307,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004990823664156941,
|
|
"loss": 5.8578,
|
|
"mean_token_accuracy": 0.1503001019358635,
|
|
"num_tokens": 8099934.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"entropy": 5.969454669952393,
|
|
"epoch": 0.3696702373450956,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990796625766981,
|
|
"loss": 5.8544,
|
|
"mean_token_accuracy": 0.14106032997369766,
|
|
"num_tokens": 8108969.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"entropy": 5.791495323181152,
|
|
"epoch": 0.3700903171602605,
|
|
"grad_norm": 1.171875,
|
|
"learning_rate": 0.0004990769547682462,
|
|
"loss": 5.7606,
|
|
"mean_token_accuracy": 0.14634729623794557,
|
|
"num_tokens": 8117372.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"entropy": 6.0099778175354,
|
|
"epoch": 0.3705103969754253,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004990742429903866,
|
|
"loss": 5.9586,
|
|
"mean_token_accuracy": 0.13980280980467796,
|
|
"num_tokens": 8127108.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"entropy": 5.983245754241944,
|
|
"epoch": 0.3709304767905902,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000499071527243167,
|
|
"loss": 5.9341,
|
|
"mean_token_accuracy": 0.13901495784521103,
|
|
"num_tokens": 8137392.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"entropy": 5.866679716110229,
|
|
"epoch": 0.3713505566057551,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004990688075266357,
|
|
"loss": 5.7869,
|
|
"mean_token_accuracy": 0.15010641515254974,
|
|
"num_tokens": 8146257.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"entropy": 5.818060064315796,
|
|
"epoch": 0.37177063642091995,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004990660838408409,
|
|
"loss": 5.7406,
|
|
"mean_token_accuracy": 0.14406471997499465,
|
|
"num_tokens": 8154952.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"entropy": 5.896796941757202,
|
|
"epoch": 0.37219071623608485,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990633561858308,
|
|
"loss": 5.8106,
|
|
"mean_token_accuracy": 0.14199803844094278,
|
|
"num_tokens": 8164365.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"entropy": 5.897768831253051,
|
|
"epoch": 0.37261079605124975,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990606245616537,
|
|
"loss": 5.806,
|
|
"mean_token_accuracy": 0.14310891702771186,
|
|
"num_tokens": 8172614.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"entropy": 5.942156267166138,
|
|
"epoch": 0.37303087586641465,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004990578889683579,
|
|
"loss": 5.8804,
|
|
"mean_token_accuracy": 0.1399701401591301,
|
|
"num_tokens": 8182445.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"entropy": 5.876403188705444,
|
|
"epoch": 0.3734509556815795,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004990551494059921,
|
|
"loss": 5.759,
|
|
"mean_token_accuracy": 0.14562231674790382,
|
|
"num_tokens": 8191871.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"entropy": 5.910711050033569,
|
|
"epoch": 0.3738710354967444,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990524058746047,
|
|
"loss": 5.9163,
|
|
"mean_token_accuracy": 0.14722812101244925,
|
|
"num_tokens": 8200658.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"entropy": 5.870158910751343,
|
|
"epoch": 0.3742911153119093,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004990496583742443,
|
|
"loss": 5.828,
|
|
"mean_token_accuracy": 0.13948734179139138,
|
|
"num_tokens": 8209776.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"entropy": 5.8510631084442135,
|
|
"epoch": 0.3747111951270741,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004990469069049596,
|
|
"loss": 5.7655,
|
|
"mean_token_accuracy": 0.14834618717432022,
|
|
"num_tokens": 8219401.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"entropy": 5.841431474685669,
|
|
"epoch": 0.375131274942239,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004990441514667993,
|
|
"loss": 5.8164,
|
|
"mean_token_accuracy": 0.14826851561665536,
|
|
"num_tokens": 8228762.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"entropy": 5.943975448608398,
|
|
"epoch": 0.3755513547574039,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004990413920598121,
|
|
"loss": 5.8117,
|
|
"mean_token_accuracy": 0.1469542607665062,
|
|
"num_tokens": 8236612.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"entropy": 5.879862689971924,
|
|
"epoch": 0.37597143457256876,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004990386286840471,
|
|
"loss": 5.831,
|
|
"mean_token_accuracy": 0.14322375506162643,
|
|
"num_tokens": 8245043.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"entropy": 6.000154876708985,
|
|
"epoch": 0.37639151438773366,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004990358613395532,
|
|
"loss": 5.9178,
|
|
"mean_token_accuracy": 0.14013071805238725,
|
|
"num_tokens": 8255270.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"entropy": 5.975326061248779,
|
|
"epoch": 0.37681159420289856,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004990330900263792,
|
|
"loss": 5.8795,
|
|
"mean_token_accuracy": 0.13827874809503554,
|
|
"num_tokens": 8264761.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"entropy": 5.914210987091065,
|
|
"epoch": 0.37723167401806346,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004990303147445745,
|
|
"loss": 5.8254,
|
|
"mean_token_accuracy": 0.14292607977986335,
|
|
"num_tokens": 8274308.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"entropy": 5.824473428726196,
|
|
"epoch": 0.3776517538332283,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004990275354941881,
|
|
"loss": 5.7135,
|
|
"mean_token_accuracy": 0.15530410706996917,
|
|
"num_tokens": 8283323.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"entropy": 5.8834668636322025,
|
|
"epoch": 0.3780718336483932,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004990247522752694,
|
|
"loss": 6.0456,
|
|
"mean_token_accuracy": 0.12988803088665007,
|
|
"num_tokens": 8293452.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"entropy": 5.955636644363404,
|
|
"epoch": 0.3784919134635581,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990219650878674,
|
|
"loss": 5.7218,
|
|
"mean_token_accuracy": 0.15242000669240952,
|
|
"num_tokens": 8302941.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"entropy": 5.847525358200073,
|
|
"epoch": 0.37891199327872294,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004990191739320318,
|
|
"loss": 5.7466,
|
|
"mean_token_accuracy": 0.1486208386719227,
|
|
"num_tokens": 8311811.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"entropy": 5.740224170684814,
|
|
"epoch": 0.37933207309388783,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004990163788078117,
|
|
"loss": 5.6461,
|
|
"mean_token_accuracy": 0.15186312943696975,
|
|
"num_tokens": 8321130.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"entropy": 5.8464127540588375,
|
|
"epoch": 0.37975215290905273,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004990135797152569,
|
|
"loss": 5.7849,
|
|
"mean_token_accuracy": 0.1421021416783333,
|
|
"num_tokens": 8330233.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"entropy": 5.841711711883545,
|
|
"epoch": 0.3801722327242176,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004990107766544169,
|
|
"loss": 5.7542,
|
|
"mean_token_accuracy": 0.14685214608907698,
|
|
"num_tokens": 8338585.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"entropy": 5.820804738998413,
|
|
"epoch": 0.38059231253938247,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004990079696253413,
|
|
"loss": 5.7772,
|
|
"mean_token_accuracy": 0.15359208285808562,
|
|
"num_tokens": 8346618.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"entropy": 5.856412267684936,
|
|
"epoch": 0.38101239235454737,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004990051586280799,
|
|
"loss": 5.7538,
|
|
"mean_token_accuracy": 0.14895081669092178,
|
|
"num_tokens": 8356273.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"entropy": 5.84465913772583,
|
|
"epoch": 0.38143247216971227,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004990023436626824,
|
|
"loss": 5.7524,
|
|
"mean_token_accuracy": 0.15021177157759666,
|
|
"num_tokens": 8366668.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"entropy": 5.9355755805969235,
|
|
"epoch": 0.3818525519848771,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004989995247291988,
|
|
"loss": 5.8869,
|
|
"mean_token_accuracy": 0.14657219648361205,
|
|
"num_tokens": 8375610.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"entropy": 5.892274522781372,
|
|
"epoch": 0.382272631800042,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004989967018276789,
|
|
"loss": 5.7508,
|
|
"mean_token_accuracy": 0.14968783855438234,
|
|
"num_tokens": 8384455.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"entropy": 5.7833092212677,
|
|
"epoch": 0.3826927116152069,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004989938749581727,
|
|
"loss": 5.7971,
|
|
"mean_token_accuracy": 0.14261699542403222,
|
|
"num_tokens": 8393868.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"entropy": 5.87280683517456,
|
|
"epoch": 0.38311279143037175,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004989910441207305,
|
|
"loss": 5.8159,
|
|
"mean_token_accuracy": 0.142460335791111,
|
|
"num_tokens": 8402916.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"entropy": 5.866139030456543,
|
|
"epoch": 0.38353287124553664,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004989882093154023,
|
|
"loss": 5.7386,
|
|
"mean_token_accuracy": 0.14703422486782075,
|
|
"num_tokens": 8411649.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"entropy": 5.867181587219238,
|
|
"epoch": 0.38395295106070154,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004989853705422381,
|
|
"loss": 5.8685,
|
|
"mean_token_accuracy": 0.14120551571249962,
|
|
"num_tokens": 8420393.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"entropy": 5.828695583343506,
|
|
"epoch": 0.38437303087586644,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989825278012886,
|
|
"loss": 5.7515,
|
|
"mean_token_accuracy": 0.14682712703943251,
|
|
"num_tokens": 8429404.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"entropy": 5.838935279846192,
|
|
"epoch": 0.3847931106910313,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000498979681092604,
|
|
"loss": 5.7956,
|
|
"mean_token_accuracy": 0.1419872298836708,
|
|
"num_tokens": 8438299.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"entropy": 5.8123194694519045,
|
|
"epoch": 0.3852131905061962,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004989768304162345,
|
|
"loss": 5.7252,
|
|
"mean_token_accuracy": 0.14794419556856156,
|
|
"num_tokens": 8447392.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"entropy": 5.904961681365966,
|
|
"epoch": 0.3856332703213611,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004989739757722308,
|
|
"loss": 5.8303,
|
|
"mean_token_accuracy": 0.14383739084005356,
|
|
"num_tokens": 8456361.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"entropy": 5.8575108528137205,
|
|
"epoch": 0.3860533501365259,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004989711171606436,
|
|
"loss": 5.7685,
|
|
"mean_token_accuracy": 0.14821998178958892,
|
|
"num_tokens": 8465548.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"entropy": 5.868388605117798,
|
|
"epoch": 0.3864734299516908,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004989682545815232,
|
|
"loss": 5.7918,
|
|
"mean_token_accuracy": 0.1475853756070137,
|
|
"num_tokens": 8474454.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"entropy": 5.851971101760864,
|
|
"epoch": 0.3868935097668557,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004989653880349207,
|
|
"loss": 5.702,
|
|
"mean_token_accuracy": 0.14926233142614365,
|
|
"num_tokens": 8482694.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"entropy": 5.900271463394165,
|
|
"epoch": 0.38731358958202056,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004989625175208864,
|
|
"loss": 5.8168,
|
|
"mean_token_accuracy": 0.1423931635916233,
|
|
"num_tokens": 8491162.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"entropy": 5.7257490158081055,
|
|
"epoch": 0.38773366939718545,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004989596430394717,
|
|
"loss": 5.6821,
|
|
"mean_token_accuracy": 0.15937959849834443,
|
|
"num_tokens": 8500716.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"entropy": 5.808234357833863,
|
|
"epoch": 0.38815374921235035,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.000498956764590727,
|
|
"loss": 5.708,
|
|
"mean_token_accuracy": 0.14701972305774688,
|
|
"num_tokens": 8508871.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"entropy": 5.95609483718872,
|
|
"epoch": 0.38857382902751525,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004989538821747037,
|
|
"loss": 5.9183,
|
|
"mean_token_accuracy": 0.14093246757984162,
|
|
"num_tokens": 8518450.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"entropy": 5.905833387374878,
|
|
"epoch": 0.3889939088426801,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004989509957914527,
|
|
"loss": 5.8226,
|
|
"mean_token_accuracy": 0.13851853534579278,
|
|
"num_tokens": 8528238.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"entropy": 5.809105253219604,
|
|
"epoch": 0.389413988657845,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004989481054410251,
|
|
"loss": 5.7101,
|
|
"mean_token_accuracy": 0.14374718815088272,
|
|
"num_tokens": 8537587.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"entropy": 5.856853580474853,
|
|
"epoch": 0.3898340684730099,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004989452111234721,
|
|
"loss": 5.832,
|
|
"mean_token_accuracy": 0.14598365053534507,
|
|
"num_tokens": 8547703.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"entropy": 5.896316909790039,
|
|
"epoch": 0.39025414828817473,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000498942312838845,
|
|
"loss": 5.7758,
|
|
"mean_token_accuracy": 0.14673489332199097,
|
|
"num_tokens": 8557001.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"entropy": 5.753481054306031,
|
|
"epoch": 0.3906742281033396,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004989394105871952,
|
|
"loss": 5.6574,
|
|
"mean_token_accuracy": 0.15356985479593277,
|
|
"num_tokens": 8565638.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"entropy": 5.9358145236969,
|
|
"epoch": 0.3910943079185045,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000498936504368574,
|
|
"loss": 5.824,
|
|
"mean_token_accuracy": 0.14542939141392708,
|
|
"num_tokens": 8574428.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"entropy": 5.850586032867431,
|
|
"epoch": 0.3915143877336694,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004989335941830329,
|
|
"loss": 5.7948,
|
|
"mean_token_accuracy": 0.14559997841715813,
|
|
"num_tokens": 8583157.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"entropy": 5.788503217697143,
|
|
"epoch": 0.39193446754883426,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004989306800306236,
|
|
"loss": 5.7506,
|
|
"mean_token_accuracy": 0.14149378091096879,
|
|
"num_tokens": 8592382.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"entropy": 5.7819007396697994,
|
|
"epoch": 0.39235454736399916,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004989277619113975,
|
|
"loss": 5.7277,
|
|
"mean_token_accuracy": 0.15354567617177964,
|
|
"num_tokens": 8601058.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"entropy": 5.888355016708374,
|
|
"epoch": 0.39277462717916406,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004989248398254065,
|
|
"loss": 5.809,
|
|
"mean_token_accuracy": 0.1449069932103157,
|
|
"num_tokens": 8609479.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"entropy": 5.876518249511719,
|
|
"epoch": 0.3931947069943289,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004989219137727021,
|
|
"loss": 5.7915,
|
|
"mean_token_accuracy": 0.14826752841472626,
|
|
"num_tokens": 8618860.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"entropy": 5.84470911026001,
|
|
"epoch": 0.3936147868094938,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004989189837533365,
|
|
"loss": 5.735,
|
|
"mean_token_accuracy": 0.14821926057338713,
|
|
"num_tokens": 8627462.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"entropy": 5.9267487049102785,
|
|
"epoch": 0.3940348666246587,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004989160497673613,
|
|
"loss": 5.9044,
|
|
"mean_token_accuracy": 0.14269359409809113,
|
|
"num_tokens": 8637569.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"entropy": 5.871499490737915,
|
|
"epoch": 0.39445494643982354,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004989131118148286,
|
|
"loss": 5.717,
|
|
"mean_token_accuracy": 0.14633206725120546,
|
|
"num_tokens": 8645440.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"entropy": 5.8017088890075685,
|
|
"epoch": 0.39487502625498844,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004989101698957904,
|
|
"loss": 5.8682,
|
|
"mean_token_accuracy": 0.14381342753767967,
|
|
"num_tokens": 8655077.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"entropy": 5.897370004653931,
|
|
"epoch": 0.39529510607015333,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004989072240102988,
|
|
"loss": 5.802,
|
|
"mean_token_accuracy": 0.14655678868293762,
|
|
"num_tokens": 8663126.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"entropy": 5.9488218307495115,
|
|
"epoch": 0.39571518588531823,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004989042741584061,
|
|
"loss": 5.7704,
|
|
"mean_token_accuracy": 0.14519642665982246,
|
|
"num_tokens": 8672386.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"entropy": 5.720374536514282,
|
|
"epoch": 0.3961352657004831,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004989013203401645,
|
|
"loss": 5.7158,
|
|
"mean_token_accuracy": 0.14469049870967865,
|
|
"num_tokens": 8681930.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"entropy": 5.841011619567871,
|
|
"epoch": 0.396555345515648,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004988983625556264,
|
|
"loss": 5.7771,
|
|
"mean_token_accuracy": 0.14254847168922424,
|
|
"num_tokens": 8690993.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"entropy": 5.822189235687256,
|
|
"epoch": 0.39697542533081287,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004988954008048438,
|
|
"loss": 5.7542,
|
|
"mean_token_accuracy": 0.1459605447947979,
|
|
"num_tokens": 8699497.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"entropy": 5.995753383636474,
|
|
"epoch": 0.3973955051459777,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004988924350878697,
|
|
"loss": 5.9601,
|
|
"mean_token_accuracy": 0.13361823558807373,
|
|
"num_tokens": 8709274.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"entropy": 5.916924142837525,
|
|
"epoch": 0.3978155849611426,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004988894654047563,
|
|
"loss": 5.8045,
|
|
"mean_token_accuracy": 0.14044342935085297,
|
|
"num_tokens": 8718158.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"entropy": 5.780064105987549,
|
|
"epoch": 0.3982356647763075,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004988864917555562,
|
|
"loss": 5.6905,
|
|
"mean_token_accuracy": 0.147587950527668,
|
|
"num_tokens": 8727459.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"entropy": 5.872677993774414,
|
|
"epoch": 0.3986557445914724,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004988835141403224,
|
|
"loss": 5.838,
|
|
"mean_token_accuracy": 0.15111552625894548,
|
|
"num_tokens": 8737614.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"entropy": 5.793733787536621,
|
|
"epoch": 0.39907582440663725,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004988805325591073,
|
|
"loss": 5.6586,
|
|
"mean_token_accuracy": 0.14995396584272386,
|
|
"num_tokens": 8746799.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"entropy": 5.854796743392944,
|
|
"epoch": 0.39949590422180214,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004988775470119639,
|
|
"loss": 5.8437,
|
|
"mean_token_accuracy": 0.14196141958236694,
|
|
"num_tokens": 8756555.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"entropy": 5.811031866073608,
|
|
"epoch": 0.39991598403696704,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004988745574989451,
|
|
"loss": 5.8678,
|
|
"mean_token_accuracy": 0.14688000455498695,
|
|
"num_tokens": 8765849.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"entropy": 5.97343111038208,
|
|
"epoch": 0.4003360638521319,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004988715640201036,
|
|
"loss": 5.9254,
|
|
"mean_token_accuracy": 0.14007072225213052,
|
|
"num_tokens": 8775713.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"entropy": 5.833617496490478,
|
|
"epoch": 0.4007561436672968,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004988685665754928,
|
|
"loss": 5.757,
|
|
"mean_token_accuracy": 0.14587045535445214,
|
|
"num_tokens": 8784717.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"entropy": 5.850550556182862,
|
|
"epoch": 0.4011762234824617,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004988655651651656,
|
|
"loss": 5.7758,
|
|
"mean_token_accuracy": 0.14603159129619597,
|
|
"num_tokens": 8794388.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"entropy": 5.799873685836792,
|
|
"epoch": 0.4015963032976265,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004988625597891751,
|
|
"loss": 5.7832,
|
|
"mean_token_accuracy": 0.1439102217555046,
|
|
"num_tokens": 8802436.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"entropy": 5.901528596878052,
|
|
"epoch": 0.4020163831127914,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004988595504475746,
|
|
"loss": 5.7387,
|
|
"mean_token_accuracy": 0.14495839625597,
|
|
"num_tokens": 8811184.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"entropy": 5.891771650314331,
|
|
"epoch": 0.4024364629279563,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004988565371404175,
|
|
"loss": 5.8182,
|
|
"mean_token_accuracy": 0.14259228706359864,
|
|
"num_tokens": 8820525.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"entropy": 5.840786600112915,
|
|
"epoch": 0.4028565427431212,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004988535198677571,
|
|
"loss": 5.6717,
|
|
"mean_token_accuracy": 0.15582364946603774,
|
|
"num_tokens": 8828928.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"entropy": 5.857646703720093,
|
|
"epoch": 0.40327662255828606,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004988504986296469,
|
|
"loss": 5.9008,
|
|
"mean_token_accuracy": 0.1377402052283287,
|
|
"num_tokens": 8838615.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"entropy": 5.874919462203979,
|
|
"epoch": 0.40369670237345096,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004988474734261404,
|
|
"loss": 5.8723,
|
|
"mean_token_accuracy": 0.1379916787147522,
|
|
"num_tokens": 8848709.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"entropy": 5.955801677703858,
|
|
"epoch": 0.40411678218861585,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004988444442572911,
|
|
"loss": 5.8116,
|
|
"mean_token_accuracy": 0.1377601645886898,
|
|
"num_tokens": 8858277.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"entropy": 5.8524405002594,
|
|
"epoch": 0.4045368620037807,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004988414111231528,
|
|
"loss": 5.7865,
|
|
"mean_token_accuracy": 0.146932952105999,
|
|
"num_tokens": 8868436.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"entropy": 5.81365385055542,
|
|
"epoch": 0.4049569418189456,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000498838374023779,
|
|
"loss": 5.7632,
|
|
"mean_token_accuracy": 0.1458624616265297,
|
|
"num_tokens": 8877740.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"entropy": 5.897781801223755,
|
|
"epoch": 0.4053770216341105,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004988353329592239,
|
|
"loss": 5.7534,
|
|
"mean_token_accuracy": 0.1458469048142433,
|
|
"num_tokens": 8887408.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"entropy": 5.865758180618286,
|
|
"epoch": 0.4057971014492754,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004988322879295409,
|
|
"loss": 5.9214,
|
|
"mean_token_accuracy": 0.13947931975126265,
|
|
"num_tokens": 8897141.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"entropy": 5.81132230758667,
|
|
"epoch": 0.40621718126444023,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004988292389347844,
|
|
"loss": 5.6894,
|
|
"mean_token_accuracy": 0.153226038813591,
|
|
"num_tokens": 8905747.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"entropy": 5.9674177169799805,
|
|
"epoch": 0.40663726107960513,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000498826185975008,
|
|
"loss": 5.8416,
|
|
"mean_token_accuracy": 0.14231978505849838,
|
|
"num_tokens": 8914926.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"entropy": 5.819942045211792,
|
|
"epoch": 0.40705734089477,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004988231290502662,
|
|
"loss": 5.8576,
|
|
"mean_token_accuracy": 0.14279644340276718,
|
|
"num_tokens": 8923956.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"entropy": 5.920290994644165,
|
|
"epoch": 0.40747742070993487,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004988200681606127,
|
|
"loss": 5.724,
|
|
"mean_token_accuracy": 0.1409930519759655,
|
|
"num_tokens": 8932654.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"entropy": 5.8871392726898195,
|
|
"epoch": 0.40789750052509977,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000498817003306102,
|
|
"loss": 5.7162,
|
|
"mean_token_accuracy": 0.15066221207380295,
|
|
"num_tokens": 8941716.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"entropy": 5.792912864685059,
|
|
"epoch": 0.40831758034026466,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004988139344867884,
|
|
"loss": 5.7907,
|
|
"mean_token_accuracy": 0.14440437257289887,
|
|
"num_tokens": 8950377.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"entropy": 5.823495244979858,
|
|
"epoch": 0.4087376601554295,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004988108617027261,
|
|
"loss": 5.7438,
|
|
"mean_token_accuracy": 0.1429793991148472,
|
|
"num_tokens": 8959857.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"entropy": 5.811827516555786,
|
|
"epoch": 0.4091577399705944,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004988077849539698,
|
|
"loss": 5.6945,
|
|
"mean_token_accuracy": 0.14948356971144677,
|
|
"num_tokens": 8968272.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"entropy": 5.826247787475586,
|
|
"epoch": 0.4095778197857593,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004988047042405736,
|
|
"loss": 5.765,
|
|
"mean_token_accuracy": 0.1494896613061428,
|
|
"num_tokens": 8977445.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"entropy": 5.932396030426025,
|
|
"epoch": 0.4099978996009242,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004988016195625924,
|
|
"loss": 5.8399,
|
|
"mean_token_accuracy": 0.13975587710738183,
|
|
"num_tokens": 8987315.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"entropy": 5.874420547485352,
|
|
"epoch": 0.41041797941608904,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004987985309200807,
|
|
"loss": 5.8545,
|
|
"mean_token_accuracy": 0.14247876554727554,
|
|
"num_tokens": 8998119.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"entropy": 5.734928989410401,
|
|
"epoch": 0.41083805923125394,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004987954383130934,
|
|
"loss": 5.7211,
|
|
"mean_token_accuracy": 0.15389348119497298,
|
|
"num_tokens": 9007167.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"entropy": 5.831616592407227,
|
|
"epoch": 0.41125813904641884,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000498792341741685,
|
|
"loss": 5.7751,
|
|
"mean_token_accuracy": 0.14183046370744706,
|
|
"num_tokens": 9016690.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"entropy": 5.9142228126525875,
|
|
"epoch": 0.4116782188615837,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004987892412059106,
|
|
"loss": 5.8608,
|
|
"mean_token_accuracy": 0.14799081161618233,
|
|
"num_tokens": 9026117.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"entropy": 5.786413145065308,
|
|
"epoch": 0.4120982986767486,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004987861367058251,
|
|
"loss": 5.7333,
|
|
"mean_token_accuracy": 0.14700580313801764,
|
|
"num_tokens": 9035754.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"entropy": 5.831767606735229,
|
|
"epoch": 0.4125183784919135,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004987830282414833,
|
|
"loss": 5.7383,
|
|
"mean_token_accuracy": 0.1511758364737034,
|
|
"num_tokens": 9045453.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"entropy": 5.920518159866333,
|
|
"epoch": 0.41293845830707837,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004987799158129404,
|
|
"loss": 5.866,
|
|
"mean_token_accuracy": 0.14217546358704566,
|
|
"num_tokens": 9056045.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"entropy": 5.811709785461426,
|
|
"epoch": 0.4133585381222432,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004987767994202516,
|
|
"loss": 5.7367,
|
|
"mean_token_accuracy": 0.14320328831672668,
|
|
"num_tokens": 9065728.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"entropy": 5.832678318023682,
|
|
"epoch": 0.4137786179374081,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004987736790634719,
|
|
"loss": 5.7682,
|
|
"mean_token_accuracy": 0.14474476575851442,
|
|
"num_tokens": 9075522.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"entropy": 5.789442873001098,
|
|
"epoch": 0.414198697752573,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004987705547426568,
|
|
"loss": 5.7403,
|
|
"mean_token_accuracy": 0.141367207467556,
|
|
"num_tokens": 9084412.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"entropy": 5.866390943527222,
|
|
"epoch": 0.41461877756773785,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004987674264578615,
|
|
"loss": 5.8202,
|
|
"mean_token_accuracy": 0.13946182429790496,
|
|
"num_tokens": 9094289.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"entropy": 5.853621578216552,
|
|
"epoch": 0.41503885738290275,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004987642942091414,
|
|
"loss": 5.7305,
|
|
"mean_token_accuracy": 0.1456735163927078,
|
|
"num_tokens": 9103124.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"entropy": 5.831802606582642,
|
|
"epoch": 0.41545893719806765,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004987611579965523,
|
|
"loss": 5.6742,
|
|
"mean_token_accuracy": 0.14086953178048134,
|
|
"num_tokens": 9112794.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"entropy": 5.873828983306884,
|
|
"epoch": 0.4158790170132325,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004987580178201492,
|
|
"loss": 5.8342,
|
|
"mean_token_accuracy": 0.1499299481511116,
|
|
"num_tokens": 9122718.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"entropy": 5.850664281845093,
|
|
"epoch": 0.4162990968283974,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004987548736799882,
|
|
"loss": 5.8516,
|
|
"mean_token_accuracy": 0.14340481981635095,
|
|
"num_tokens": 9131855.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"entropy": 5.820421504974365,
|
|
"epoch": 0.4167191766435623,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004987517255761248,
|
|
"loss": 5.6959,
|
|
"mean_token_accuracy": 0.15145303905010224,
|
|
"num_tokens": 9141102.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"entropy": 5.806706857681275,
|
|
"epoch": 0.4171392564587272,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004987485735086148,
|
|
"loss": 5.7767,
|
|
"mean_token_accuracy": 0.14581410586833954,
|
|
"num_tokens": 9150552.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"entropy": 5.901687812805176,
|
|
"epoch": 0.417559336273892,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000498745417477514,
|
|
"loss": 5.7678,
|
|
"mean_token_accuracy": 0.1465201199054718,
|
|
"num_tokens": 9160105.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"entropy": 5.833481121063232,
|
|
"epoch": 0.4179794160890569,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004987422574828784,
|
|
"loss": 5.7442,
|
|
"mean_token_accuracy": 0.14697531685233117,
|
|
"num_tokens": 9169367.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"entropy": 5.801551008224488,
|
|
"epoch": 0.4183994959042218,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004987390935247639,
|
|
"loss": 5.6473,
|
|
"mean_token_accuracy": 0.1524960733950138,
|
|
"num_tokens": 9177872.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"entropy": 5.88420295715332,
|
|
"epoch": 0.41881957571938666,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004987359256032265,
|
|
"loss": 5.842,
|
|
"mean_token_accuracy": 0.13641551584005357,
|
|
"num_tokens": 9187879.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"entropy": 5.839552402496338,
|
|
"epoch": 0.41923965553455156,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004987327537183225,
|
|
"loss": 5.7627,
|
|
"mean_token_accuracy": 0.14583497866988182,
|
|
"num_tokens": 9198281.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"entropy": 5.830025053024292,
|
|
"epoch": 0.41965973534971646,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004987295778701078,
|
|
"loss": 5.7616,
|
|
"mean_token_accuracy": 0.1472972884774208,
|
|
"num_tokens": 9207670.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"entropy": 5.890192222595215,
|
|
"epoch": 0.42007981516488135,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.000498726398058639,
|
|
"loss": 5.7624,
|
|
"mean_token_accuracy": 0.1478295773267746,
|
|
"num_tokens": 9216995.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"entropy": 5.86348524093628,
|
|
"epoch": 0.4204998949800462,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004987232142839723,
|
|
"loss": 5.84,
|
|
"mean_token_accuracy": 0.14336878657341004,
|
|
"num_tokens": 9227330.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"entropy": 5.881588125228882,
|
|
"epoch": 0.4209199747952111,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004987200265461638,
|
|
"loss": 5.765,
|
|
"mean_token_accuracy": 0.15374772921204566,
|
|
"num_tokens": 9236666.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"entropy": 5.874930953979492,
|
|
"epoch": 0.421340054610376,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004987168348452705,
|
|
"loss": 5.7753,
|
|
"mean_token_accuracy": 0.14532062411308289,
|
|
"num_tokens": 9246388.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"entropy": 5.783464574813843,
|
|
"epoch": 0.42176013442554083,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004987136391813485,
|
|
"loss": 5.7285,
|
|
"mean_token_accuracy": 0.15355198979377746,
|
|
"num_tokens": 9255239.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"entropy": 5.740076160430908,
|
|
"epoch": 0.42218021424070573,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004987104395544547,
|
|
"loss": 5.7036,
|
|
"mean_token_accuracy": 0.14613911658525466,
|
|
"num_tokens": 9264468.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"entropy": 5.791272926330566,
|
|
"epoch": 0.42260029405587063,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004987072359646455,
|
|
"loss": 5.767,
|
|
"mean_token_accuracy": 0.15074100941419602,
|
|
"num_tokens": 9274140.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"entropy": 5.844473838806152,
|
|
"epoch": 0.42302037387103547,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004987040284119778,
|
|
"loss": 5.7527,
|
|
"mean_token_accuracy": 0.14496962279081343,
|
|
"num_tokens": 9283539.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"entropy": 5.793737411499023,
|
|
"epoch": 0.42344045368620037,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004987008168965087,
|
|
"loss": 5.7492,
|
|
"mean_token_accuracy": 0.1446337193250656,
|
|
"num_tokens": 9292664.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"entropy": 5.882410097122192,
|
|
"epoch": 0.42386053350136527,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004986976014182946,
|
|
"loss": 5.8632,
|
|
"mean_token_accuracy": 0.14475177749991416,
|
|
"num_tokens": 9302814.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"entropy": 5.91447172164917,
|
|
"epoch": 0.42428061331653016,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004986943819773927,
|
|
"loss": 5.8446,
|
|
"mean_token_accuracy": 0.13993989303708076,
|
|
"num_tokens": 9312654.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"entropy": 5.944200277328491,
|
|
"epoch": 0.424700693131695,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.00049869115857386,
|
|
"loss": 5.8629,
|
|
"mean_token_accuracy": 0.13871566727757453,
|
|
"num_tokens": 9322271.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"entropy": 5.914797401428222,
|
|
"epoch": 0.4251207729468599,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004986879312077536,
|
|
"loss": 5.7991,
|
|
"mean_token_accuracy": 0.14315683469176294,
|
|
"num_tokens": 9331341.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"entropy": 5.854491138458252,
|
|
"epoch": 0.4255408527620248,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004986846998791308,
|
|
"loss": 5.7292,
|
|
"mean_token_accuracy": 0.1450161539018154,
|
|
"num_tokens": 9339863.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"entropy": 5.760708379745483,
|
|
"epoch": 0.42596093257718964,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004986814645880485,
|
|
"loss": 5.7122,
|
|
"mean_token_accuracy": 0.14831122979521752,
|
|
"num_tokens": 9349488.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"entropy": 5.763440895080566,
|
|
"epoch": 0.42638101239235454,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004986782253345645,
|
|
"loss": 5.7155,
|
|
"mean_token_accuracy": 0.1446495160460472,
|
|
"num_tokens": 9357977.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"entropy": 5.873974847793579,
|
|
"epoch": 0.42680109220751944,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004986749821187358,
|
|
"loss": 5.8291,
|
|
"mean_token_accuracy": 0.14651158899068834,
|
|
"num_tokens": 9367449.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"entropy": 5.939826440811157,
|
|
"epoch": 0.42722117202268434,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00049867173494062,
|
|
"loss": 5.8443,
|
|
"mean_token_accuracy": 0.14635560363531114,
|
|
"num_tokens": 9377070.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"entropy": 5.814030504226684,
|
|
"epoch": 0.4276412518378492,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004986684838002744,
|
|
"loss": 5.6402,
|
|
"mean_token_accuracy": 0.14356765672564506,
|
|
"num_tokens": 9385881.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"entropy": 5.812148237228394,
|
|
"epoch": 0.4280613316530141,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004986652286977569,
|
|
"loss": 5.7935,
|
|
"mean_token_accuracy": 0.14297776967287062,
|
|
"num_tokens": 9395159.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"entropy": 5.8577290058135985,
|
|
"epoch": 0.428481411468179,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004986619696331252,
|
|
"loss": 5.7352,
|
|
"mean_token_accuracy": 0.14707281142473222,
|
|
"num_tokens": 9404590.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"entropy": 5.849812984466553,
|
|
"epoch": 0.4289014912833438,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004986587066064367,
|
|
"loss": 5.7485,
|
|
"mean_token_accuracy": 0.15095358043909074,
|
|
"num_tokens": 9414452.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"entropy": 5.895452976226807,
|
|
"epoch": 0.4293215710985087,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004986554396177494,
|
|
"loss": 5.8792,
|
|
"mean_token_accuracy": 0.136289519071579,
|
|
"num_tokens": 9424004.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"entropy": 5.892696142196655,
|
|
"epoch": 0.4297416509136736,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004986521686671212,
|
|
"loss": 5.7555,
|
|
"mean_token_accuracy": 0.1541634440422058,
|
|
"num_tokens": 9433487.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"entropy": 5.878170013427734,
|
|
"epoch": 0.43016173072883845,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00049864889375461,
|
|
"loss": 5.8153,
|
|
"mean_token_accuracy": 0.14046706035733222,
|
|
"num_tokens": 9442742.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"entropy": 5.880551862716675,
|
|
"epoch": 0.43058181054400335,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004986456148802738,
|
|
"loss": 5.8803,
|
|
"mean_token_accuracy": 0.1430477738380432,
|
|
"num_tokens": 9452550.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"entropy": 5.947691774368286,
|
|
"epoch": 0.43100189035916825,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004986423320441707,
|
|
"loss": 5.8325,
|
|
"mean_token_accuracy": 0.13825444877147675,
|
|
"num_tokens": 9461920.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"entropy": 5.91607780456543,
|
|
"epoch": 0.43142197017433315,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004986390452463588,
|
|
"loss": 5.7649,
|
|
"mean_token_accuracy": 0.1430374413728714,
|
|
"num_tokens": 9470817.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"entropy": 5.730179119110107,
|
|
"epoch": 0.431842049989498,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004986357544868964,
|
|
"loss": 5.7124,
|
|
"mean_token_accuracy": 0.15054248571395873,
|
|
"num_tokens": 9479936.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"entropy": 5.873006772994995,
|
|
"epoch": 0.4322621298046629,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004986324597658418,
|
|
"loss": 5.7327,
|
|
"mean_token_accuracy": 0.15253930985927583,
|
|
"num_tokens": 9489818.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"entropy": 5.753958749771118,
|
|
"epoch": 0.4326822096198278,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004986291610832533,
|
|
"loss": 5.7373,
|
|
"mean_token_accuracy": 0.1437373712658882,
|
|
"num_tokens": 9499688.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"entropy": 5.927468347549438,
|
|
"epoch": 0.4331022894349926,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004986258584391892,
|
|
"loss": 5.8034,
|
|
"mean_token_accuracy": 0.14375736117362975,
|
|
"num_tokens": 9509581.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"entropy": 5.965673732757568,
|
|
"epoch": 0.4335223692501575,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004986225518337084,
|
|
"loss": 5.8699,
|
|
"mean_token_accuracy": 0.14116424545645714,
|
|
"num_tokens": 9518556.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"entropy": 5.785204839706421,
|
|
"epoch": 0.4339424490653224,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004986192412668692,
|
|
"loss": 5.7774,
|
|
"mean_token_accuracy": 0.14554062783718108,
|
|
"num_tokens": 9527612.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"entropy": 5.780537843704224,
|
|
"epoch": 0.4343625288804873,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004986159267387302,
|
|
"loss": 5.6635,
|
|
"mean_token_accuracy": 0.15324972867965697,
|
|
"num_tokens": 9535882.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"entropy": 5.812596940994263,
|
|
"epoch": 0.43478260869565216,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004986126082493502,
|
|
"loss": 5.7687,
|
|
"mean_token_accuracy": 0.15061265081167222,
|
|
"num_tokens": 9544799.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"entropy": 5.831722640991211,
|
|
"epoch": 0.43520268851081706,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004986092857987881,
|
|
"loss": 5.6862,
|
|
"mean_token_accuracy": 0.14822531789541243,
|
|
"num_tokens": 9553805.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"entropy": 5.813159799575805,
|
|
"epoch": 0.43562276832598196,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004986059593871026,
|
|
"loss": 5.717,
|
|
"mean_token_accuracy": 0.14472756162285805,
|
|
"num_tokens": 9563493.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"entropy": 5.843486309051514,
|
|
"epoch": 0.4360428481411468,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004986026290143527,
|
|
"loss": 5.8036,
|
|
"mean_token_accuracy": 0.1416473552584648,
|
|
"num_tokens": 9572297.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"entropy": 5.981090354919433,
|
|
"epoch": 0.4364629279563117,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004985992946805973,
|
|
"loss": 5.944,
|
|
"mean_token_accuracy": 0.13955255076289177,
|
|
"num_tokens": 9581967.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"entropy": 5.916019868850708,
|
|
"epoch": 0.4368830077714766,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004985959563858955,
|
|
"loss": 5.8377,
|
|
"mean_token_accuracy": 0.14462848305702208,
|
|
"num_tokens": 9590885.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"entropy": 5.895185899734497,
|
|
"epoch": 0.43730308758664144,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004985926141303066,
|
|
"loss": 5.7664,
|
|
"mean_token_accuracy": 0.14494283646345138,
|
|
"num_tokens": 9599247.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"entropy": 5.799692010879516,
|
|
"epoch": 0.43772316740180633,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004985892679138896,
|
|
"loss": 5.6813,
|
|
"mean_token_accuracy": 0.15228856280446051,
|
|
"num_tokens": 9608296.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"entropy": 5.7856512546539305,
|
|
"epoch": 0.43814324721697123,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004985859177367038,
|
|
"loss": 5.743,
|
|
"mean_token_accuracy": 0.14503268152475357,
|
|
"num_tokens": 9616734.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"entropy": 5.923814201354981,
|
|
"epoch": 0.43856332703213613,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004985825635988087,
|
|
"loss": 5.8284,
|
|
"mean_token_accuracy": 0.14173559248447418,
|
|
"num_tokens": 9626246.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"entropy": 5.93772325515747,
|
|
"epoch": 0.43898340684730097,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 0.0004985792055002635,
|
|
"loss": 5.6928,
|
|
"mean_token_accuracy": 0.14719630777835846,
|
|
"num_tokens": 9634963.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"entropy": 5.874487066268921,
|
|
"epoch": 0.43940348666246587,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004985758434411278,
|
|
"loss": 5.7792,
|
|
"mean_token_accuracy": 0.1490402415394783,
|
|
"num_tokens": 9643615.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"entropy": 5.830899858474732,
|
|
"epoch": 0.43982356647763077,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004985724774214613,
|
|
"loss": 5.7618,
|
|
"mean_token_accuracy": 0.14308954402804375,
|
|
"num_tokens": 9653306.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"entropy": 5.849113464355469,
|
|
"epoch": 0.4402436462927956,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004985691074413233,
|
|
"loss": 5.7907,
|
|
"mean_token_accuracy": 0.14236897826194764,
|
|
"num_tokens": 9662389.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"entropy": 5.755652236938476,
|
|
"epoch": 0.4406637261079605,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004985657335007739,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.1460557647049427,
|
|
"num_tokens": 9671183.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"entropy": 5.91265082359314,
|
|
"epoch": 0.4410838059231254,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004985623555998725,
|
|
"loss": 5.7558,
|
|
"mean_token_accuracy": 0.15076574087142944,
|
|
"num_tokens": 9680544.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"entropy": 5.915414428710937,
|
|
"epoch": 0.4415038857382903,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004985589737386791,
|
|
"loss": 5.7849,
|
|
"mean_token_accuracy": 0.14628923386335374,
|
|
"num_tokens": 9690137.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"entropy": 5.76580057144165,
|
|
"epoch": 0.44192396555345514,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004985555879172535,
|
|
"loss": 5.718,
|
|
"mean_token_accuracy": 0.14928821474313736,
|
|
"num_tokens": 9699149.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"entropy": 5.85457215309143,
|
|
"epoch": 0.44234404536862004,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000498552198135656,
|
|
"loss": 5.8061,
|
|
"mean_token_accuracy": 0.1479725457727909,
|
|
"num_tokens": 9709308.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"entropy": 5.86218228340149,
|
|
"epoch": 0.44276412518378494,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004985488043939462,
|
|
"loss": 5.7433,
|
|
"mean_token_accuracy": 0.14763879179954528,
|
|
"num_tokens": 9718462.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"entropy": 5.848517847061157,
|
|
"epoch": 0.4431842049989498,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004985454066921846,
|
|
"loss": 5.6657,
|
|
"mean_token_accuracy": 0.15294953137636186,
|
|
"num_tokens": 9727626.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"entropy": 5.697809362411499,
|
|
"epoch": 0.4436042848141147,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004985420050304312,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.14653837010264398,
|
|
"num_tokens": 9737091.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"entropy": 5.776859283447266,
|
|
"epoch": 0.4440243646292796,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004985385994087462,
|
|
"loss": 5.7667,
|
|
"mean_token_accuracy": 0.14408197328448297,
|
|
"num_tokens": 9746135.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"entropy": 5.90691499710083,
|
|
"epoch": 0.4444444444444444,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004985351898271901,
|
|
"loss": 5.6909,
|
|
"mean_token_accuracy": 0.14973507225513458,
|
|
"num_tokens": 9754549.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"entropy": 5.90071930885315,
|
|
"epoch": 0.4448645242596093,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004985317762858231,
|
|
"loss": 5.8522,
|
|
"mean_token_accuracy": 0.1393520101904869,
|
|
"num_tokens": 9764219.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"entropy": 5.850748586654663,
|
|
"epoch": 0.4452846040747742,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000498528358784706,
|
|
"loss": 5.6867,
|
|
"mean_token_accuracy": 0.15024393796920776,
|
|
"num_tokens": 9772234.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"entropy": 5.805743646621704,
|
|
"epoch": 0.4457046838899391,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.000498524937323899,
|
|
"loss": 5.7412,
|
|
"mean_token_accuracy": 0.1513320118188858,
|
|
"num_tokens": 9781417.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"entropy": 5.939965295791626,
|
|
"epoch": 0.44612476370510395,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004985215119034628,
|
|
"loss": 5.8569,
|
|
"mean_token_accuracy": 0.14016103446483613,
|
|
"num_tokens": 9791286.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"entropy": 5.8461981296539305,
|
|
"epoch": 0.44654484352026885,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004985180825234582,
|
|
"loss": 5.8526,
|
|
"mean_token_accuracy": 0.14801667556166648,
|
|
"num_tokens": 9802157.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"entropy": 5.973970174789429,
|
|
"epoch": 0.44696492333543375,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004985146491839459,
|
|
"loss": 5.8417,
|
|
"mean_token_accuracy": 0.1345731109380722,
|
|
"num_tokens": 9812646.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"entropy": 5.948847103118896,
|
|
"epoch": 0.4473850031505986,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004985112118849865,
|
|
"loss": 5.8516,
|
|
"mean_token_accuracy": 0.13398924767971038,
|
|
"num_tokens": 9822274.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"entropy": 5.782685327529907,
|
|
"epoch": 0.4478050829657635,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004985077706266412,
|
|
"loss": 5.6553,
|
|
"mean_token_accuracy": 0.14450234919786453,
|
|
"num_tokens": 9831337.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"entropy": 5.838633728027344,
|
|
"epoch": 0.4482251627809284,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004985043254089708,
|
|
"loss": 5.7708,
|
|
"mean_token_accuracy": 0.1397250510752201,
|
|
"num_tokens": 9840798.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"entropy": 5.778713369369507,
|
|
"epoch": 0.44864524259609323,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004985008762320364,
|
|
"loss": 5.7606,
|
|
"mean_token_accuracy": 0.14193158969283104,
|
|
"num_tokens": 9850117.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"entropy": 5.850035953521728,
|
|
"epoch": 0.4490653224112581,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000498497423095899,
|
|
"loss": 5.7103,
|
|
"mean_token_accuracy": 0.1520538941025734,
|
|
"num_tokens": 9858227.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"entropy": 5.814627599716187,
|
|
"epoch": 0.449485402226423,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004984939660006199,
|
|
"loss": 5.7869,
|
|
"mean_token_accuracy": 0.14539863988757135,
|
|
"num_tokens": 9867157.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"entropy": 5.7682719230651855,
|
|
"epoch": 0.4499054820415879,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004984905049462602,
|
|
"loss": 5.7017,
|
|
"mean_token_accuracy": 0.14406744837760926,
|
|
"num_tokens": 9877045.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"entropy": 5.952353000640869,
|
|
"epoch": 0.45032556185675277,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004984870399328814,
|
|
"loss": 5.8435,
|
|
"mean_token_accuracy": 0.14134326651692392,
|
|
"num_tokens": 9886637.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"entropy": 5.7878330707550045,
|
|
"epoch": 0.45074564167191766,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004984835709605446,
|
|
"loss": 5.7137,
|
|
"mean_token_accuracy": 0.15287383496761323,
|
|
"num_tokens": 9895601.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"entropy": 5.821693420410156,
|
|
"epoch": 0.45116572148708256,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004984800980293116,
|
|
"loss": 5.8616,
|
|
"mean_token_accuracy": 0.14498891532421113,
|
|
"num_tokens": 9904775.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"entropy": 5.841092920303344,
|
|
"epoch": 0.4515858013022474,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004984766211392435,
|
|
"loss": 5.7982,
|
|
"mean_token_accuracy": 0.14115200862288474,
|
|
"num_tokens": 9913795.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"entropy": 5.857733345031738,
|
|
"epoch": 0.4520058811174123,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004984731402904024,
|
|
"loss": 5.6297,
|
|
"mean_token_accuracy": 0.15439519211649894,
|
|
"num_tokens": 9922576.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"entropy": 5.776189756393433,
|
|
"epoch": 0.4524259609325772,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004984696554828496,
|
|
"loss": 5.6223,
|
|
"mean_token_accuracy": 0.1497255176305771,
|
|
"num_tokens": 9930971.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"entropy": 5.789079189300537,
|
|
"epoch": 0.4528460407477421,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004984661667166468,
|
|
"loss": 5.7472,
|
|
"mean_token_accuracy": 0.15051774978637694,
|
|
"num_tokens": 9939628.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"entropy": 5.814688301086425,
|
|
"epoch": 0.45326612056290694,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004984626739918561,
|
|
"loss": 5.7234,
|
|
"mean_token_accuracy": 0.15411069244146347,
|
|
"num_tokens": 9948397.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"entropy": 5.903667163848877,
|
|
"epoch": 0.45368620037807184,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004984591773085391,
|
|
"loss": 5.7883,
|
|
"mean_token_accuracy": 0.14472548216581343,
|
|
"num_tokens": 9957683.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"entropy": 5.878373336791992,
|
|
"epoch": 0.45410628019323673,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004984556766667578,
|
|
"loss": 5.775,
|
|
"mean_token_accuracy": 0.14814501702785493,
|
|
"num_tokens": 9966756.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"entropy": 5.782114458084107,
|
|
"epoch": 0.4545263600084016,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004984521720665743,
|
|
"loss": 5.7889,
|
|
"mean_token_accuracy": 0.14963518232107162,
|
|
"num_tokens": 9976000.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"entropy": 5.890175533294678,
|
|
"epoch": 0.4549464398235665,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004984486635080507,
|
|
"loss": 5.7788,
|
|
"mean_token_accuracy": 0.14989694356918334,
|
|
"num_tokens": 9985509.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"entropy": 5.741406440734863,
|
|
"epoch": 0.45536651963873137,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004984451509912489,
|
|
"loss": 5.7261,
|
|
"mean_token_accuracy": 0.14937452524900435,
|
|
"num_tokens": 9994342.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"entropy": 5.76695761680603,
|
|
"epoch": 0.4557865994538962,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004984416345162315,
|
|
"loss": 5.7717,
|
|
"mean_token_accuracy": 0.14712392613291742,
|
|
"num_tokens": 10004249.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"entropy": 5.824491500854492,
|
|
"epoch": 0.4562066792690611,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004984381140830605,
|
|
"loss": 5.7345,
|
|
"mean_token_accuracy": 0.14771454110741616,
|
|
"num_tokens": 10012430.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"entropy": 5.907290077209472,
|
|
"epoch": 0.456626759084226,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004984345896917984,
|
|
"loss": 5.745,
|
|
"mean_token_accuracy": 0.14652188569307328,
|
|
"num_tokens": 10021434.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"entropy": 5.826172637939453,
|
|
"epoch": 0.4570468388993909,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004984310613425076,
|
|
"loss": 5.7465,
|
|
"mean_token_accuracy": 0.1499892771244049,
|
|
"num_tokens": 10030473.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"entropy": 5.826717472076416,
|
|
"epoch": 0.45746691871455575,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004984275290352506,
|
|
"loss": 5.7186,
|
|
"mean_token_accuracy": 0.15438036620616913,
|
|
"num_tokens": 10039057.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"entropy": 5.879296159744262,
|
|
"epoch": 0.45788699852972065,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004984239927700899,
|
|
"loss": 5.8158,
|
|
"mean_token_accuracy": 0.14881263822317123,
|
|
"num_tokens": 10047998.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"entropy": 5.906938457489014,
|
|
"epoch": 0.45830707834488554,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004984204525470883,
|
|
"loss": 5.7609,
|
|
"mean_token_accuracy": 0.14349722415208815,
|
|
"num_tokens": 10057479.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"entropy": 5.797386360168457,
|
|
"epoch": 0.4587271581600504,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004984169083663084,
|
|
"loss": 5.7284,
|
|
"mean_token_accuracy": 0.14215826913714408,
|
|
"num_tokens": 10067754.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"entropy": 5.769140672683716,
|
|
"epoch": 0.4591472379752153,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004984133602278129,
|
|
"loss": 5.8074,
|
|
"mean_token_accuracy": 0.14632865488529206,
|
|
"num_tokens": 10076815.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"entropy": 5.929787874221802,
|
|
"epoch": 0.4595673177903802,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000498409808131665,
|
|
"loss": 5.8172,
|
|
"mean_token_accuracy": 0.14295997619628906,
|
|
"num_tokens": 10086300.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"entropy": 5.820594406127929,
|
|
"epoch": 0.4599873976055451,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004984062520779272,
|
|
"loss": 5.714,
|
|
"mean_token_accuracy": 0.15191168189048768,
|
|
"num_tokens": 10095383.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"entropy": 5.724618530273437,
|
|
"epoch": 0.4604074774207099,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004984026920666628,
|
|
"loss": 5.6894,
|
|
"mean_token_accuracy": 0.14800985455513,
|
|
"num_tokens": 10103971.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"entropy": 5.801212739944458,
|
|
"epoch": 0.4608275572358748,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004983991280979347,
|
|
"loss": 5.6852,
|
|
"mean_token_accuracy": 0.15083771497011184,
|
|
"num_tokens": 10113028.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"entropy": 5.8347760200500485,
|
|
"epoch": 0.4612476370510397,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004983955601718061,
|
|
"loss": 5.6653,
|
|
"mean_token_accuracy": 0.14897917956113815,
|
|
"num_tokens": 10121890.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"entropy": 5.866452312469482,
|
|
"epoch": 0.46166771686620456,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983919882883401,
|
|
"loss": 5.792,
|
|
"mean_token_accuracy": 0.14483870565891266,
|
|
"num_tokens": 10131655.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"entropy": 5.878374290466309,
|
|
"epoch": 0.46208779668136946,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004983884124476,
|
|
"loss": 5.7823,
|
|
"mean_token_accuracy": 0.1475811406970024,
|
|
"num_tokens": 10140778.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"entropy": 5.852365493774414,
|
|
"epoch": 0.46250787649653435,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004983848326496494,
|
|
"loss": 5.841,
|
|
"mean_token_accuracy": 0.14311327114701272,
|
|
"num_tokens": 10150229.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"entropy": 5.915929079055786,
|
|
"epoch": 0.4629279563116992,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004983812488945513,
|
|
"loss": 5.7457,
|
|
"mean_token_accuracy": 0.1459451988339424,
|
|
"num_tokens": 10158939.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"entropy": 5.816072797775268,
|
|
"epoch": 0.4633480361268641,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004983776611823696,
|
|
"loss": 5.7201,
|
|
"mean_token_accuracy": 0.1463111788034439,
|
|
"num_tokens": 10168383.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"entropy": 5.759098815917969,
|
|
"epoch": 0.463768115942029,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004983740695131676,
|
|
"loss": 5.7315,
|
|
"mean_token_accuracy": 0.14871828705072404,
|
|
"num_tokens": 10178678.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"entropy": 5.8468421459197994,
|
|
"epoch": 0.4641881957571939,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.000498370473887009,
|
|
"loss": 5.7383,
|
|
"mean_token_accuracy": 0.14837254881858825,
|
|
"num_tokens": 10188964.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"entropy": 5.893353366851807,
|
|
"epoch": 0.46460827557235873,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004983668743039573,
|
|
"loss": 5.7476,
|
|
"mean_token_accuracy": 0.15488529577851295,
|
|
"num_tokens": 10198333.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"entropy": 5.760245847702026,
|
|
"epoch": 0.46502835538752363,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004983632707640766,
|
|
"loss": 5.7603,
|
|
"mean_token_accuracy": 0.14777156189084054,
|
|
"num_tokens": 10207876.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"entropy": 5.78720121383667,
|
|
"epoch": 0.4654484352026885,
|
|
"grad_norm": 1.1796875,
|
|
"learning_rate": 0.0004983596632674306,
|
|
"loss": 5.699,
|
|
"mean_token_accuracy": 0.15150602906942368,
|
|
"num_tokens": 10216822.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"entropy": 5.902374649047852,
|
|
"epoch": 0.46586851501785337,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004983560518140831,
|
|
"loss": 5.8282,
|
|
"mean_token_accuracy": 0.13706823736429213,
|
|
"num_tokens": 10226887.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"entropy": 5.864826297760009,
|
|
"epoch": 0.46628859483301827,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004983524364040982,
|
|
"loss": 5.6734,
|
|
"mean_token_accuracy": 0.1513754189014435,
|
|
"num_tokens": 10235935.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"entropy": 5.801887845993042,
|
|
"epoch": 0.46670867464818316,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004983488170375399,
|
|
"loss": 5.6238,
|
|
"mean_token_accuracy": 0.1499588668346405,
|
|
"num_tokens": 10245590.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"entropy": 5.719043874740601,
|
|
"epoch": 0.46712875446334806,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004983451937144723,
|
|
"loss": 5.7005,
|
|
"mean_token_accuracy": 0.14591586142778395,
|
|
"num_tokens": 10255104.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"entropy": 5.659116125106811,
|
|
"epoch": 0.4675488342785129,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004983415664349595,
|
|
"loss": 5.571,
|
|
"mean_token_accuracy": 0.15797292441129684,
|
|
"num_tokens": 10264236.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"entropy": 5.817941188812256,
|
|
"epoch": 0.4679689140936778,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004983379351990659,
|
|
"loss": 5.6847,
|
|
"mean_token_accuracy": 0.1529952183365822,
|
|
"num_tokens": 10273335.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"entropy": 5.723038148880005,
|
|
"epoch": 0.4683889939088427,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004983343000068559,
|
|
"loss": 5.6501,
|
|
"mean_token_accuracy": 0.1551787719130516,
|
|
"num_tokens": 10282206.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"entropy": 5.658549833297729,
|
|
"epoch": 0.46880907372400754,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004983306608583937,
|
|
"loss": 5.6064,
|
|
"mean_token_accuracy": 0.1606610283255577,
|
|
"num_tokens": 10290056.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"entropy": 5.765953302383423,
|
|
"epoch": 0.46922915353917244,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004983270177537438,
|
|
"loss": 5.688,
|
|
"mean_token_accuracy": 0.14721163958311081,
|
|
"num_tokens": 10299726.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"entropy": 5.791565895080566,
|
|
"epoch": 0.46964923335433734,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004983233706929708,
|
|
"loss": 5.7499,
|
|
"mean_token_accuracy": 0.14841315150260925,
|
|
"num_tokens": 10308696.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"entropy": 5.832704639434814,
|
|
"epoch": 0.4700693131695022,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004983197196761392,
|
|
"loss": 5.8361,
|
|
"mean_token_accuracy": 0.14296030402183532,
|
|
"num_tokens": 10317845.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"entropy": 5.783017778396607,
|
|
"epoch": 0.4704893929846671,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004983160647033139,
|
|
"loss": 5.7164,
|
|
"mean_token_accuracy": 0.1538068726658821,
|
|
"num_tokens": 10326563.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"entropy": 5.82874116897583,
|
|
"epoch": 0.470909472799832,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004983124057745595,
|
|
"loss": 5.7062,
|
|
"mean_token_accuracy": 0.14498351216316224,
|
|
"num_tokens": 10335931.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"entropy": 5.721787977218628,
|
|
"epoch": 0.47132955261499687,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004983087428899408,
|
|
"loss": 5.7048,
|
|
"mean_token_accuracy": 0.1419524259865284,
|
|
"num_tokens": 10344984.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"entropy": 5.79919285774231,
|
|
"epoch": 0.4717496324301617,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004983050760495227,
|
|
"loss": 5.7392,
|
|
"mean_token_accuracy": 0.1474568262696266,
|
|
"num_tokens": 10353522.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"entropy": 5.907325601577758,
|
|
"epoch": 0.4721697122453266,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004983014052533702,
|
|
"loss": 5.7558,
|
|
"mean_token_accuracy": 0.15104926228523255,
|
|
"num_tokens": 10363527.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"entropy": 5.760684251785278,
|
|
"epoch": 0.4725897920604915,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004982977305015481,
|
|
"loss": 5.6642,
|
|
"mean_token_accuracy": 0.14723679050803185,
|
|
"num_tokens": 10372040.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"entropy": 5.758435773849487,
|
|
"epoch": 0.47300987187565635,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004982940517941219,
|
|
"loss": 5.6546,
|
|
"mean_token_accuracy": 0.14704783335328103,
|
|
"num_tokens": 10381279.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"entropy": 5.856901502609253,
|
|
"epoch": 0.47342995169082125,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004982903691311564,
|
|
"loss": 5.812,
|
|
"mean_token_accuracy": 0.14307899996638299,
|
|
"num_tokens": 10390608.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"entropy": 5.73579969406128,
|
|
"epoch": 0.47385003150598615,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004982866825127172,
|
|
"loss": 5.6083,
|
|
"mean_token_accuracy": 0.1558460235595703,
|
|
"num_tokens": 10399851.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"entropy": 5.866163825988769,
|
|
"epoch": 0.47427011132115104,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004982829919388692,
|
|
"loss": 5.9015,
|
|
"mean_token_accuracy": 0.1417351670563221,
|
|
"num_tokens": 10410425.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"entropy": 5.8129795551300045,
|
|
"epoch": 0.4746901911363159,
|
|
"grad_norm": 1.2265625,
|
|
"learning_rate": 0.0004982792974096781,
|
|
"loss": 5.6731,
|
|
"mean_token_accuracy": 0.15132275298237802,
|
|
"num_tokens": 10418783.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"entropy": 5.84749026298523,
|
|
"epoch": 0.4751102709514808,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000498275598925209,
|
|
"loss": 5.8182,
|
|
"mean_token_accuracy": 0.14057652279734612,
|
|
"num_tokens": 10427360.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"entropy": 5.890136814117431,
|
|
"epoch": 0.4755303507666457,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004982718964855277,
|
|
"loss": 5.7924,
|
|
"mean_token_accuracy": 0.1445530578494072,
|
|
"num_tokens": 10436613.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"entropy": 5.866897964477539,
|
|
"epoch": 0.4759504305818105,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004982681900907,
|
|
"loss": 5.824,
|
|
"mean_token_accuracy": 0.14708172976970674,
|
|
"num_tokens": 10445055.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"entropy": 5.747807741165161,
|
|
"epoch": 0.4763705103969754,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.000498264479740791,
|
|
"loss": 5.6512,
|
|
"mean_token_accuracy": 0.15550965517759324,
|
|
"num_tokens": 10454516.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"entropy": 5.870613145828247,
|
|
"epoch": 0.4767905902121403,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004982607654358668,
|
|
"loss": 5.7869,
|
|
"mean_token_accuracy": 0.15260702520608901,
|
|
"num_tokens": 10463771.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"entropy": 5.8192380428314205,
|
|
"epoch": 0.47721067002730516,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000498257047175993,
|
|
"loss": 5.7218,
|
|
"mean_token_accuracy": 0.1428370013833046,
|
|
"num_tokens": 10473783.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"entropy": 5.814655399322509,
|
|
"epoch": 0.47763074984247006,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004982533249612357,
|
|
"loss": 5.7006,
|
|
"mean_token_accuracy": 0.14914624020457268,
|
|
"num_tokens": 10483424.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"entropy": 5.733596324920654,
|
|
"epoch": 0.47805082965763496,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004982495987916607,
|
|
"loss": 5.6225,
|
|
"mean_token_accuracy": 0.15583412498235702,
|
|
"num_tokens": 10492536.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"entropy": 5.762752342224121,
|
|
"epoch": 0.47847090947279985,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004982458686673339,
|
|
"loss": 5.7437,
|
|
"mean_token_accuracy": 0.14716439545154572,
|
|
"num_tokens": 10501616.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"entropy": 5.899823999404907,
|
|
"epoch": 0.4788909892879647,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004982421345883217,
|
|
"loss": 5.7689,
|
|
"mean_token_accuracy": 0.14301676750183107,
|
|
"num_tokens": 10511190.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"entropy": 5.779811668395996,
|
|
"epoch": 0.4793110691031296,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004982383965546898,
|
|
"loss": 5.7065,
|
|
"mean_token_accuracy": 0.14420118555426598,
|
|
"num_tokens": 10520310.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"entropy": 5.831453895568847,
|
|
"epoch": 0.4797311489182945,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004982346545665048,
|
|
"loss": 5.6901,
|
|
"mean_token_accuracy": 0.15481160432100297,
|
|
"num_tokens": 10528711.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"entropy": 5.8021493434906,
|
|
"epoch": 0.48015122873345933,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982309086238328,
|
|
"loss": 5.784,
|
|
"mean_token_accuracy": 0.1420671336352825,
|
|
"num_tokens": 10538484.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"entropy": 5.836937665939331,
|
|
"epoch": 0.48057130854862423,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004982271587267403,
|
|
"loss": 5.7229,
|
|
"mean_token_accuracy": 0.1457110583782196,
|
|
"num_tokens": 10547623.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"entropy": 5.850609970092774,
|
|
"epoch": 0.48099138836378913,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004982234048752935,
|
|
"loss": 5.6716,
|
|
"mean_token_accuracy": 0.14635915756225587,
|
|
"num_tokens": 10556234.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"entropy": 5.8946315288543705,
|
|
"epoch": 0.481411468178954,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.000498219647069559,
|
|
"loss": 5.8951,
|
|
"mean_token_accuracy": 0.13762107565999032,
|
|
"num_tokens": 10566308.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"entropy": 5.863095760345459,
|
|
"epoch": 0.48183154799411887,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004982158853096035,
|
|
"loss": 5.8416,
|
|
"mean_token_accuracy": 0.13780406042933463,
|
|
"num_tokens": 10575212.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"entropy": 5.846006250381469,
|
|
"epoch": 0.48225162780928377,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004982121195954935,
|
|
"loss": 5.6097,
|
|
"mean_token_accuracy": 0.1565147191286087,
|
|
"num_tokens": 10584590.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"entropy": 5.7412327289581295,
|
|
"epoch": 0.48267170762444866,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004982083499272957,
|
|
"loss": 5.67,
|
|
"mean_token_accuracy": 0.14907400235533713,
|
|
"num_tokens": 10593997.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"entropy": 5.807267189025879,
|
|
"epoch": 0.4830917874396135,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004982045763050768,
|
|
"loss": 5.8041,
|
|
"mean_token_accuracy": 0.1457974396646023,
|
|
"num_tokens": 10603299.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"entropy": 5.819139719009399,
|
|
"epoch": 0.4835118672547784,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004982007987289041,
|
|
"loss": 5.7361,
|
|
"mean_token_accuracy": 0.14838966429233552,
|
|
"num_tokens": 10613546.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"entropy": 5.766327381134033,
|
|
"epoch": 0.4839319470699433,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004981970171988439,
|
|
"loss": 5.7119,
|
|
"mean_token_accuracy": 0.15573283806443214,
|
|
"num_tokens": 10622966.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"entropy": 5.8138025283813475,
|
|
"epoch": 0.48435202688510814,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004981932317149636,
|
|
"loss": 5.7886,
|
|
"mean_token_accuracy": 0.14435389563441275,
|
|
"num_tokens": 10633441.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"entropy": 5.843524694442749,
|
|
"epoch": 0.48477210670027304,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00049818944227733,
|
|
"loss": 5.7587,
|
|
"mean_token_accuracy": 0.14196690768003464,
|
|
"num_tokens": 10643124.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"entropy": 5.8232954978942875,
|
|
"epoch": 0.48519218651543794,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004981856488860105,
|
|
"loss": 5.7342,
|
|
"mean_token_accuracy": 0.14407299906015397,
|
|
"num_tokens": 10652517.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"entropy": 5.807135486602784,
|
|
"epoch": 0.48561226633060284,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004981818515410721,
|
|
"loss": 5.7752,
|
|
"mean_token_accuracy": 0.1420580416917801,
|
|
"num_tokens": 10663352.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"entropy": 5.849091625213623,
|
|
"epoch": 0.4860323461457677,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004981780502425821,
|
|
"loss": 5.7942,
|
|
"mean_token_accuracy": 0.14775322377681732,
|
|
"num_tokens": 10672430.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"entropy": 5.835392618179322,
|
|
"epoch": 0.4864524259609326,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004981742449906079,
|
|
"loss": 5.7391,
|
|
"mean_token_accuracy": 0.1527121603488922,
|
|
"num_tokens": 10681908.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"entropy": 5.863359975814819,
|
|
"epoch": 0.4868725057760975,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004981704357852168,
|
|
"loss": 5.7322,
|
|
"mean_token_accuracy": 0.14731571227312087,
|
|
"num_tokens": 10691259.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"entropy": 5.765371608734131,
|
|
"epoch": 0.4872925855912623,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004981666226264764,
|
|
"loss": 5.6277,
|
|
"mean_token_accuracy": 0.14454422146081924,
|
|
"num_tokens": 10699668.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"entropy": 5.800967359542847,
|
|
"epoch": 0.4877126654064272,
|
|
"grad_norm": 1.125,
|
|
"learning_rate": 0.0004981628055144542,
|
|
"loss": 5.6801,
|
|
"mean_token_accuracy": 0.15341386646032334,
|
|
"num_tokens": 10709146.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"entropy": 5.810514068603515,
|
|
"epoch": 0.4881327452215921,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004981589844492177,
|
|
"loss": 5.7695,
|
|
"mean_token_accuracy": 0.14522838592529297,
|
|
"num_tokens": 10718724.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"entropy": 5.761872911453247,
|
|
"epoch": 0.488552825036757,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004981551594308349,
|
|
"loss": 5.7216,
|
|
"mean_token_accuracy": 0.147171813249588,
|
|
"num_tokens": 10728101.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"entropy": 5.862204074859619,
|
|
"epoch": 0.48897290485192185,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004981513304593733,
|
|
"loss": 5.7442,
|
|
"mean_token_accuracy": 0.15811584144830704,
|
|
"num_tokens": 10736750.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"entropy": 5.867376232147217,
|
|
"epoch": 0.48939298466708675,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004981474975349006,
|
|
"loss": 5.9234,
|
|
"mean_token_accuracy": 0.1416974514722824,
|
|
"num_tokens": 10746914.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"entropy": 5.859874296188354,
|
|
"epoch": 0.48981306448225165,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000498143660657485,
|
|
"loss": 5.7529,
|
|
"mean_token_accuracy": 0.1483662724494934,
|
|
"num_tokens": 10755786.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"entropy": 5.708901691436767,
|
|
"epoch": 0.4902331442974165,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004981398198271944,
|
|
"loss": 5.62,
|
|
"mean_token_accuracy": 0.15774477571249007,
|
|
"num_tokens": 10764821.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"entropy": 5.774704790115356,
|
|
"epoch": 0.4906532241125814,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004981359750440968,
|
|
"loss": 5.7095,
|
|
"mean_token_accuracy": 0.1461211383342743,
|
|
"num_tokens": 10773569.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"entropy": 5.725243186950683,
|
|
"epoch": 0.4910733039277463,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004981321263082603,
|
|
"loss": 5.6856,
|
|
"mean_token_accuracy": 0.1431654214859009,
|
|
"num_tokens": 10782298.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"entropy": 5.738628387451172,
|
|
"epoch": 0.4914933837429111,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000498128273619753,
|
|
"loss": 5.6828,
|
|
"mean_token_accuracy": 0.14975926280021667,
|
|
"num_tokens": 10792087.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"entropy": 5.819939708709716,
|
|
"epoch": 0.491913463558076,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004981244169786433,
|
|
"loss": 5.7701,
|
|
"mean_token_accuracy": 0.14303557947278023,
|
|
"num_tokens": 10801641.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"entropy": 5.9354065418243405,
|
|
"epoch": 0.4923335433732409,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004981205563849994,
|
|
"loss": 5.8349,
|
|
"mean_token_accuracy": 0.14316534698009492,
|
|
"num_tokens": 10811612.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"entropy": 5.813403034210205,
|
|
"epoch": 0.4927536231884058,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004981166918388897,
|
|
"loss": 5.6588,
|
|
"mean_token_accuracy": 0.15225751399993898,
|
|
"num_tokens": 10821608.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"entropy": 5.730116319656372,
|
|
"epoch": 0.49317370300357066,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004981128233403828,
|
|
"loss": 5.6081,
|
|
"mean_token_accuracy": 0.15562243461608888,
|
|
"num_tokens": 10830679.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"entropy": 5.774923992156983,
|
|
"epoch": 0.49359378281873556,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.000498108950889547,
|
|
"loss": 5.6988,
|
|
"mean_token_accuracy": 0.14943243488669394,
|
|
"num_tokens": 10839669.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"entropy": 5.790045118331909,
|
|
"epoch": 0.49401386263390046,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004981050744864512,
|
|
"loss": 5.6749,
|
|
"mean_token_accuracy": 0.14829235821962355,
|
|
"num_tokens": 10849666.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"entropy": 5.747752380371094,
|
|
"epoch": 0.4944339424490653,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004981011941311638,
|
|
"loss": 5.5867,
|
|
"mean_token_accuracy": 0.15378802865743638,
|
|
"num_tokens": 10858225.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"entropy": 5.754449367523193,
|
|
"epoch": 0.4948540222642302,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004980973098237535,
|
|
"loss": 5.6944,
|
|
"mean_token_accuracy": 0.14568711966276168,
|
|
"num_tokens": 10867466.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"entropy": 5.807771825790406,
|
|
"epoch": 0.4952741020793951,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004980934215642894,
|
|
"loss": 5.734,
|
|
"mean_token_accuracy": 0.15061958581209184,
|
|
"num_tokens": 10875850.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"entropy": 5.799838733673096,
|
|
"epoch": 0.49569418189456,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.00049808952935284,
|
|
"loss": 5.6623,
|
|
"mean_token_accuracy": 0.15344742313027382,
|
|
"num_tokens": 10885154.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"entropy": 5.781932210922241,
|
|
"epoch": 0.49611426170972484,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004980856331894747,
|
|
"loss": 5.7613,
|
|
"mean_token_accuracy": 0.1433933824300766,
|
|
"num_tokens": 10894080.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"entropy": 5.767203950881958,
|
|
"epoch": 0.49653434152488973,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004980817330742621,
|
|
"loss": 5.7532,
|
|
"mean_token_accuracy": 0.14163193702697754,
|
|
"num_tokens": 10903248.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"entropy": 5.848739862442017,
|
|
"epoch": 0.49695442134005463,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004980778290072716,
|
|
"loss": 5.7241,
|
|
"mean_token_accuracy": 0.14693543910980225,
|
|
"num_tokens": 10912939.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"entropy": 5.852625370025635,
|
|
"epoch": 0.4973745011552195,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004980739209885722,
|
|
"loss": 5.7214,
|
|
"mean_token_accuracy": 0.15410863906145095,
|
|
"num_tokens": 10921505.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"entropy": 5.887701892852784,
|
|
"epoch": 0.49779458097038437,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004980700090182331,
|
|
"loss": 5.8102,
|
|
"mean_token_accuracy": 0.15374419540166856,
|
|
"num_tokens": 10931861.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"entropy": 5.864461851119995,
|
|
"epoch": 0.49821466078554927,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004980660930963238,
|
|
"loss": 5.7275,
|
|
"mean_token_accuracy": 0.14706841260194778,
|
|
"num_tokens": 10940810.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"entropy": 5.740756177902222,
|
|
"epoch": 0.4986347406007141,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004980621732229133,
|
|
"loss": 5.6064,
|
|
"mean_token_accuracy": 0.15016069263219833,
|
|
"num_tokens": 10949514.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"entropy": 5.8102155208587645,
|
|
"epoch": 0.499054820415879,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004980582493980714,
|
|
"loss": 5.8295,
|
|
"mean_token_accuracy": 0.14151460081338882,
|
|
"num_tokens": 10959161.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"entropy": 5.781671619415283,
|
|
"epoch": 0.4994749002310439,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004980543216218674,
|
|
"loss": 5.6979,
|
|
"mean_token_accuracy": 0.15644685178995132,
|
|
"num_tokens": 10968983.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"entropy": 5.819313478469849,
|
|
"epoch": 0.4998949800462088,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004980503898943711,
|
|
"loss": 5.8145,
|
|
"mean_token_accuracy": 0.14870925694704057,
|
|
"num_tokens": 10978044.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"entropy": 5.8994536876678465,
|
|
"epoch": 0.5003150598613737,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004980464542156519,
|
|
"loss": 5.7276,
|
|
"mean_token_accuracy": 0.15195106863975524,
|
|
"num_tokens": 10986980.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"entropy": 5.804496479034424,
|
|
"epoch": 0.5007351396765385,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004980425145857796,
|
|
"loss": 5.6674,
|
|
"mean_token_accuracy": 0.16026414483785628,
|
|
"num_tokens": 10995163.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"entropy": 5.705014753341675,
|
|
"epoch": 0.5011552194917034,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.000498038571004824,
|
|
"loss": 5.6084,
|
|
"mean_token_accuracy": 0.15652424693107606,
|
|
"num_tokens": 11003722.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"entropy": 5.725177145004272,
|
|
"epoch": 0.5015752993068683,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004980346234728549,
|
|
"loss": 5.6641,
|
|
"mean_token_accuracy": 0.1604843556880951,
|
|
"num_tokens": 11013176.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"entropy": 5.815953350067138,
|
|
"epoch": 0.5019953791220332,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004980306719899424,
|
|
"loss": 5.7337,
|
|
"mean_token_accuracy": 0.14801866561174393,
|
|
"num_tokens": 11022636.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"entropy": 5.758071231842041,
|
|
"epoch": 0.5024154589371981,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004980267165561564,
|
|
"loss": 5.6897,
|
|
"mean_token_accuracy": 0.1525172308087349,
|
|
"num_tokens": 11031896.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"entropy": 5.801525115966797,
|
|
"epoch": 0.502835538752363,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004980227571715669,
|
|
"loss": 5.7359,
|
|
"mean_token_accuracy": 0.14860138446092605,
|
|
"num_tokens": 11040802.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"entropy": 5.774550342559815,
|
|
"epoch": 0.5032556185675279,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004980187938362441,
|
|
"loss": 5.6483,
|
|
"mean_token_accuracy": 0.14492484778165818,
|
|
"num_tokens": 11049701.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"entropy": 5.841704797744751,
|
|
"epoch": 0.5036756983826927,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004980148265502581,
|
|
"loss": 5.831,
|
|
"mean_token_accuracy": 0.14276057258248329,
|
|
"num_tokens": 11059555.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"entropy": 5.823680114746094,
|
|
"epoch": 0.5040957781978576,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004980108553136795,
|
|
"loss": 5.7444,
|
|
"mean_token_accuracy": 0.14943215250968933,
|
|
"num_tokens": 11068940.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 0.5040957781978576,
|
|
"eval_entropy": 5.697147493315468,
|
|
"eval_loss": 5.740706920623779,
|
|
"eval_mean_token_accuracy": 0.15491213997795275,
|
|
"eval_num_tokens": 11068940.0,
|
|
"eval_runtime": 27.2377,
|
|
"eval_samples_per_second": 1371.85,
|
|
"eval_steps_per_second": 171.49,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"entropy": 5.881861591339112,
|
|
"epoch": 0.5045158580130225,
|
|
"grad_norm": 1.1328125,
|
|
"learning_rate": 0.0004980068801265783,
|
|
"loss": 5.727,
|
|
"mean_token_accuracy": 0.14888829812407495,
|
|
"num_tokens": 11079014.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"entropy": 5.835190010070801,
|
|
"epoch": 0.5049359378281874,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004980029009890251,
|
|
"loss": 5.8148,
|
|
"mean_token_accuracy": 0.14982240945100783,
|
|
"num_tokens": 11089526.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"entropy": 5.815138959884644,
|
|
"epoch": 0.5053560176433523,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004979989179010904,
|
|
"loss": 5.6997,
|
|
"mean_token_accuracy": 0.15157256349921228,
|
|
"num_tokens": 11099156.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"entropy": 5.769509267807007,
|
|
"epoch": 0.5057760974585171,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004979949308628445,
|
|
"loss": 5.6704,
|
|
"mean_token_accuracy": 0.1513045035302639,
|
|
"num_tokens": 11108242.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"entropy": 5.701773405075073,
|
|
"epoch": 0.506196177273682,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004979909398743584,
|
|
"loss": 5.6759,
|
|
"mean_token_accuracy": 0.15321220606565475,
|
|
"num_tokens": 11118076.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"entropy": 5.794755840301514,
|
|
"epoch": 0.5066162570888468,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004979869449357026,
|
|
"loss": 5.7614,
|
|
"mean_token_accuracy": 0.15703760236501693,
|
|
"num_tokens": 11127265.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"entropy": 5.807633590698242,
|
|
"epoch": 0.5070363369040117,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004979829460469478,
|
|
"loss": 5.6736,
|
|
"mean_token_accuracy": 0.15248569697141648,
|
|
"num_tokens": 11136429.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"entropy": 5.786873388290405,
|
|
"epoch": 0.5074564167191766,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004979789432081649,
|
|
"loss": 5.6848,
|
|
"mean_token_accuracy": 0.14916185587644576,
|
|
"num_tokens": 11146201.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"entropy": 5.832711553573608,
|
|
"epoch": 0.5078764965343415,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000497974936419425,
|
|
"loss": 5.7005,
|
|
"mean_token_accuracy": 0.15159857869148255,
|
|
"num_tokens": 11154867.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"entropy": 5.749881458282471,
|
|
"epoch": 0.5082965763495064,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004979709256807989,
|
|
"loss": 5.7373,
|
|
"mean_token_accuracy": 0.15010453313589095,
|
|
"num_tokens": 11164092.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"entropy": 5.8348558902740475,
|
|
"epoch": 0.5087166561646713,
|
|
"grad_norm": 1.1953125,
|
|
"learning_rate": 0.0004979669109923575,
|
|
"loss": 5.7561,
|
|
"mean_token_accuracy": 0.14959800839424134,
|
|
"num_tokens": 11173176.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"entropy": 5.888238859176636,
|
|
"epoch": 0.5091367359798362,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004979628923541721,
|
|
"loss": 5.7312,
|
|
"mean_token_accuracy": 0.15100948065519332,
|
|
"num_tokens": 11182397.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"entropy": 5.828408145904541,
|
|
"epoch": 0.509556815795001,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.000497958869766314,
|
|
"loss": 5.7777,
|
|
"mean_token_accuracy": 0.1484249010682106,
|
|
"num_tokens": 11191790.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"entropy": 5.782152128219605,
|
|
"epoch": 0.5099768956101659,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004979548432288543,
|
|
"loss": 5.6808,
|
|
"mean_token_accuracy": 0.15421139895915986,
|
|
"num_tokens": 11201104.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"entropy": 5.781926536560059,
|
|
"epoch": 0.5103969754253308,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004979508127418643,
|
|
"loss": 5.6751,
|
|
"mean_token_accuracy": 0.1530236378312111,
|
|
"num_tokens": 11209578.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"entropy": 5.7934770584106445,
|
|
"epoch": 0.5108170552404957,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004979467783054155,
|
|
"loss": 5.6411,
|
|
"mean_token_accuracy": 0.15649499446153642,
|
|
"num_tokens": 11218380.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"entropy": 5.741991376876831,
|
|
"epoch": 0.5112371350556606,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004979427399195793,
|
|
"loss": 5.6468,
|
|
"mean_token_accuracy": 0.15304491966962813,
|
|
"num_tokens": 11227810.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"entropy": 5.781096887588501,
|
|
"epoch": 0.5116572148708255,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004979386975844274,
|
|
"loss": 5.6714,
|
|
"mean_token_accuracy": 0.14935449212789537,
|
|
"num_tokens": 11236631.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"entropy": 5.738012218475342,
|
|
"epoch": 0.5120772946859904,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004979346513000311,
|
|
"loss": 5.7303,
|
|
"mean_token_accuracy": 0.14597853422164916,
|
|
"num_tokens": 11247418.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"entropy": 5.700581455230713,
|
|
"epoch": 0.5124973745011552,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004979306010664623,
|
|
"loss": 5.6292,
|
|
"mean_token_accuracy": 0.1570621207356453,
|
|
"num_tokens": 11256246.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"entropy": 5.715303707122803,
|
|
"epoch": 0.5129174543163201,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004979265468837927,
|
|
"loss": 5.6189,
|
|
"mean_token_accuracy": 0.153353688120842,
|
|
"num_tokens": 11265980.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"entropy": 5.795161724090576,
|
|
"epoch": 0.513337534131485,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.000497922488752094,
|
|
"loss": 5.6695,
|
|
"mean_token_accuracy": 0.15220673233270646,
|
|
"num_tokens": 11276158.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"entropy": 5.751807403564453,
|
|
"epoch": 0.5137576139466499,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004979184266714383,
|
|
"loss": 5.5778,
|
|
"mean_token_accuracy": 0.15824360698461531,
|
|
"num_tokens": 11284957.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"entropy": 5.663950729370117,
|
|
"epoch": 0.5141776937618148,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004979143606418974,
|
|
"loss": 5.5969,
|
|
"mean_token_accuracy": 0.15413714349269866,
|
|
"num_tokens": 11294340.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"entropy": 5.871919870376587,
|
|
"epoch": 0.5145977735769797,
|
|
"grad_norm": 1.15625,
|
|
"learning_rate": 0.0004979102906635435,
|
|
"loss": 5.8581,
|
|
"mean_token_accuracy": 0.1427499048411846,
|
|
"num_tokens": 11303344.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"entropy": 5.876821136474609,
|
|
"epoch": 0.5150178533921445,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004979062167364486,
|
|
"loss": 5.718,
|
|
"mean_token_accuracy": 0.15818116441369057,
|
|
"num_tokens": 11311338.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"entropy": 5.703396987915039,
|
|
"epoch": 0.5154379332073094,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004979021388606847,
|
|
"loss": 5.5589,
|
|
"mean_token_accuracy": 0.16222982257604598,
|
|
"num_tokens": 11320194.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"entropy": 5.746637916564941,
|
|
"epoch": 0.5158580130224742,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004978980570363243,
|
|
"loss": 5.7316,
|
|
"mean_token_accuracy": 0.1521230459213257,
|
|
"num_tokens": 11329952.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"entropy": 5.715192699432373,
|
|
"epoch": 0.5162780928376391,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978939712634396,
|
|
"loss": 5.6894,
|
|
"mean_token_accuracy": 0.1550623059272766,
|
|
"num_tokens": 11339384.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"entropy": 5.8614537715911865,
|
|
"epoch": 0.516698172652804,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004978898815421029,
|
|
"loss": 5.8578,
|
|
"mean_token_accuracy": 0.14535280168056489,
|
|
"num_tokens": 11348409.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"entropy": 5.916036462783813,
|
|
"epoch": 0.5171182524679689,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004978857878723867,
|
|
"loss": 5.7757,
|
|
"mean_token_accuracy": 0.1502631425857544,
|
|
"num_tokens": 11357478.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"entropy": 5.791261196136475,
|
|
"epoch": 0.5175383322831338,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004978816902543636,
|
|
"loss": 5.7553,
|
|
"mean_token_accuracy": 0.15079374462366105,
|
|
"num_tokens": 11366379.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"entropy": 5.864686632156372,
|
|
"epoch": 0.5179584120982986,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004978775886881062,
|
|
"loss": 5.7858,
|
|
"mean_token_accuracy": 0.14658837914466857,
|
|
"num_tokens": 11376357.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"entropy": 5.7728368759155275,
|
|
"epoch": 0.5183784919134635,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000497873483173687,
|
|
"loss": 5.6671,
|
|
"mean_token_accuracy": 0.1539960816502571,
|
|
"num_tokens": 11384995.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"entropy": 5.737247657775879,
|
|
"epoch": 0.5187985717286284,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004978693737111787,
|
|
"loss": 5.6705,
|
|
"mean_token_accuracy": 0.14912281930446625,
|
|
"num_tokens": 11395363.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"entropy": 5.844261121749878,
|
|
"epoch": 0.5192186515437933,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004978652603006543,
|
|
"loss": 5.6595,
|
|
"mean_token_accuracy": 0.14817970544099807,
|
|
"num_tokens": 11404511.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"entropy": 5.792019557952881,
|
|
"epoch": 0.5196387313589582,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004978611429421866,
|
|
"loss": 5.706,
|
|
"mean_token_accuracy": 0.14902769923210143,
|
|
"num_tokens": 11413400.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"entropy": 5.796417903900147,
|
|
"epoch": 0.5200588111741231,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004978570216358485,
|
|
"loss": 5.7593,
|
|
"mean_token_accuracy": 0.14429776668548583,
|
|
"num_tokens": 11423693.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"entropy": 5.864574718475342,
|
|
"epoch": 0.520478890989288,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000497852896381713,
|
|
"loss": 5.7084,
|
|
"mean_token_accuracy": 0.14549012184143068,
|
|
"num_tokens": 11433195.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"entropy": 5.842799043655395,
|
|
"epoch": 0.5208989708044528,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004978487671798531,
|
|
"loss": 5.8297,
|
|
"mean_token_accuracy": 0.13647647053003312,
|
|
"num_tokens": 11443416.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"entropy": 5.879248857498169,
|
|
"epoch": 0.5213190506196177,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978446340303422,
|
|
"loss": 5.7058,
|
|
"mean_token_accuracy": 0.1528090812265873,
|
|
"num_tokens": 11452487.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"entropy": 5.78897385597229,
|
|
"epoch": 0.5217391304347826,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004978404969332533,
|
|
"loss": 5.7399,
|
|
"mean_token_accuracy": 0.1569316253066063,
|
|
"num_tokens": 11461893.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"entropy": 5.697704410552978,
|
|
"epoch": 0.5221592102499475,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004978363558886597,
|
|
"loss": 5.6464,
|
|
"mean_token_accuracy": 0.1460499122738838,
|
|
"num_tokens": 11471238.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"entropy": 5.79966549873352,
|
|
"epoch": 0.5225792900651124,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004978322108966348,
|
|
"loss": 5.752,
|
|
"mean_token_accuracy": 0.14630683958530427,
|
|
"num_tokens": 11480571.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"entropy": 5.799462223052979,
|
|
"epoch": 0.5229993698802773,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004978280619572521,
|
|
"loss": 5.7488,
|
|
"mean_token_accuracy": 0.14263924062252045,
|
|
"num_tokens": 11489552.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"entropy": 5.8670618534088135,
|
|
"epoch": 0.5234194496954422,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000497823909070585,
|
|
"loss": 5.7876,
|
|
"mean_token_accuracy": 0.14341011941432952,
|
|
"num_tokens": 11498715.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"entropy": 5.83455605506897,
|
|
"epoch": 0.523839529510607,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004978197522367071,
|
|
"loss": 5.733,
|
|
"mean_token_accuracy": 0.1471528485417366,
|
|
"num_tokens": 11508472.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"entropy": 5.869316053390503,
|
|
"epoch": 0.5242596093257719,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004978155914556919,
|
|
"loss": 5.6768,
|
|
"mean_token_accuracy": 0.15946109294891359,
|
|
"num_tokens": 11517620.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"entropy": 5.756000518798828,
|
|
"epoch": 0.5246796891409368,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004978114267276134,
|
|
"loss": 5.7077,
|
|
"mean_token_accuracy": 0.14685916304588317,
|
|
"num_tokens": 11526106.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"entropy": 5.81321964263916,
|
|
"epoch": 0.5250997689561017,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004978072580525451,
|
|
"loss": 5.7369,
|
|
"mean_token_accuracy": 0.15604251325130464,
|
|
"num_tokens": 11535840.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"entropy": 5.809660863876343,
|
|
"epoch": 0.5255198487712666,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000497803085430561,
|
|
"loss": 5.7363,
|
|
"mean_token_accuracy": 0.14954072907567023,
|
|
"num_tokens": 11545110.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"entropy": 5.943748712539673,
|
|
"epoch": 0.5259399285864315,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004977989088617349,
|
|
"loss": 5.7661,
|
|
"mean_token_accuracy": 0.14624925330281258,
|
|
"num_tokens": 11554382.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"entropy": 5.762158155441284,
|
|
"epoch": 0.5263600084015964,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000497794728346141,
|
|
"loss": 5.6224,
|
|
"mean_token_accuracy": 0.158403742313385,
|
|
"num_tokens": 11562821.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"entropy": 5.863851022720337,
|
|
"epoch": 0.5267800882167611,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004977905438838531,
|
|
"loss": 5.8503,
|
|
"mean_token_accuracy": 0.14122226759791373,
|
|
"num_tokens": 11571705.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"entropy": 5.719424200057984,
|
|
"epoch": 0.527200168031926,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004977863554749453,
|
|
"loss": 5.6546,
|
|
"mean_token_accuracy": 0.14733370542526245,
|
|
"num_tokens": 11580692.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"entropy": 5.734767818450928,
|
|
"epoch": 0.5276202478470909,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004977821631194922,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.14979589208960534,
|
|
"num_tokens": 11589966.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"entropy": 5.853599739074707,
|
|
"epoch": 0.5280403276622558,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004977779668175677,
|
|
"loss": 5.7372,
|
|
"mean_token_accuracy": 0.147355617582798,
|
|
"num_tokens": 11599627.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"entropy": 5.828013181686401,
|
|
"epoch": 0.5284604074774207,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004977737665692461,
|
|
"loss": 5.7248,
|
|
"mean_token_accuracy": 0.15135419219732285,
|
|
"num_tokens": 11608431.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"entropy": 5.765194272994995,
|
|
"epoch": 0.5288804872925856,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004977695623746021,
|
|
"loss": 5.6002,
|
|
"mean_token_accuracy": 0.1498238652944565,
|
|
"num_tokens": 11617552.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"entropy": 5.738640642166137,
|
|
"epoch": 0.5293005671077504,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004977653542337099,
|
|
"loss": 5.6352,
|
|
"mean_token_accuracy": 0.15803537368774415,
|
|
"num_tokens": 11626828.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"entropy": 5.746969509124756,
|
|
"epoch": 0.5297206469229153,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004977611421466443,
|
|
"loss": 5.7259,
|
|
"mean_token_accuracy": 0.14914170354604722,
|
|
"num_tokens": 11635867.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"entropy": 5.846399927139283,
|
|
"epoch": 0.5301407267380802,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004977569261134797,
|
|
"loss": 5.6433,
|
|
"mean_token_accuracy": 0.1556470736861229,
|
|
"num_tokens": 11644711.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"entropy": 5.829954338073731,
|
|
"epoch": 0.5305608065532451,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004977527061342908,
|
|
"loss": 5.7155,
|
|
"mean_token_accuracy": 0.15602043867111207,
|
|
"num_tokens": 11653320.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"entropy": 5.789897012710571,
|
|
"epoch": 0.53098088636841,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004977484822091524,
|
|
"loss": 5.6774,
|
|
"mean_token_accuracy": 0.15594944804906846,
|
|
"num_tokens": 11662753.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"entropy": 5.8104596614837645,
|
|
"epoch": 0.5314009661835749,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004977442543381394,
|
|
"loss": 5.7164,
|
|
"mean_token_accuracy": 0.14611099809408187,
|
|
"num_tokens": 11671622.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"entropy": 5.828975057601928,
|
|
"epoch": 0.5318210459987398,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004977400225213266,
|
|
"loss": 5.7045,
|
|
"mean_token_accuracy": 0.14795105382800103,
|
|
"num_tokens": 11679964.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"entropy": 5.745060634613037,
|
|
"epoch": 0.5322411258139046,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000497735786758789,
|
|
"loss": 5.6717,
|
|
"mean_token_accuracy": 0.15165670067071915,
|
|
"num_tokens": 11688700.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"entropy": 5.83441858291626,
|
|
"epoch": 0.5326612056290695,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004977315470506016,
|
|
"loss": 5.7835,
|
|
"mean_token_accuracy": 0.14904531762003898,
|
|
"num_tokens": 11698425.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"entropy": 5.9017280578613285,
|
|
"epoch": 0.5330812854442344,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004977273033968397,
|
|
"loss": 5.7618,
|
|
"mean_token_accuracy": 0.14042024686932564,
|
|
"num_tokens": 11707705.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"entropy": 5.7629967212677,
|
|
"epoch": 0.5335013652593993,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004977230557975782,
|
|
"loss": 5.6748,
|
|
"mean_token_accuracy": 0.15279789865016938,
|
|
"num_tokens": 11717079.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"entropy": 5.769342231750488,
|
|
"epoch": 0.5339214450745642,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004977188042528923,
|
|
"loss": 5.6452,
|
|
"mean_token_accuracy": 0.15056213662028312,
|
|
"num_tokens": 11725504.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"entropy": 5.820083951950073,
|
|
"epoch": 0.5343415248897291,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004977145487628576,
|
|
"loss": 5.7328,
|
|
"mean_token_accuracy": 0.14796183705329896,
|
|
"num_tokens": 11735282.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"entropy": 5.833522939682007,
|
|
"epoch": 0.534761604704894,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004977102893275494,
|
|
"loss": 5.7212,
|
|
"mean_token_accuracy": 0.1473264567553997,
|
|
"num_tokens": 11744827.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"entropy": 5.850900793075562,
|
|
"epoch": 0.5351816845200588,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.000497706025947043,
|
|
"loss": 5.6722,
|
|
"mean_token_accuracy": 0.15216659232974053,
|
|
"num_tokens": 11753066.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"entropy": 5.759478998184204,
|
|
"epoch": 0.5356017643352237,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004977017586214142,
|
|
"loss": 5.6991,
|
|
"mean_token_accuracy": 0.1468222975730896,
|
|
"num_tokens": 11761190.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"entropy": 5.775899982452392,
|
|
"epoch": 0.5360218441503886,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004976974873507382,
|
|
"loss": 5.6625,
|
|
"mean_token_accuracy": 0.15434601306915283,
|
|
"num_tokens": 11770321.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"entropy": 5.77639217376709,
|
|
"epoch": 0.5364419239655535,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.000497693212135091,
|
|
"loss": 5.7275,
|
|
"mean_token_accuracy": 0.1511888399720192,
|
|
"num_tokens": 11778388.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"entropy": 5.850775814056396,
|
|
"epoch": 0.5368620037807184,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004976889329745482,
|
|
"loss": 5.6031,
|
|
"mean_token_accuracy": 0.15313038155436515,
|
|
"num_tokens": 11786250.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"entropy": 5.698163795471191,
|
|
"epoch": 0.5372820835958833,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004976846498691857,
|
|
"loss": 5.5642,
|
|
"mean_token_accuracy": 0.1565365344285965,
|
|
"num_tokens": 11794831.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"entropy": 5.7417463779449465,
|
|
"epoch": 0.5377021634110482,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004976803628190792,
|
|
"loss": 5.6156,
|
|
"mean_token_accuracy": 0.15607637539505959,
|
|
"num_tokens": 11803550.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"entropy": 5.770711898803711,
|
|
"epoch": 0.5381222432262129,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004976760718243047,
|
|
"loss": 5.6882,
|
|
"mean_token_accuracy": 0.14991564601659774,
|
|
"num_tokens": 11812478.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"entropy": 5.792763185501099,
|
|
"epoch": 0.5385423230413778,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004976717768849383,
|
|
"loss": 5.66,
|
|
"mean_token_accuracy": 0.14323782697319984,
|
|
"num_tokens": 11822463.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"entropy": 5.790409803390503,
|
|
"epoch": 0.5389624028565427,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004976674780010561,
|
|
"loss": 5.6988,
|
|
"mean_token_accuracy": 0.14533982276916504,
|
|
"num_tokens": 11831853.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"entropy": 5.887389659881592,
|
|
"epoch": 0.5393824826717076,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000497663175172734,
|
|
"loss": 5.732,
|
|
"mean_token_accuracy": 0.1425001822412014,
|
|
"num_tokens": 11841574.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"entropy": 5.900919437408447,
|
|
"epoch": 0.5398025624868725,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004976588684000486,
|
|
"loss": 5.8292,
|
|
"mean_token_accuracy": 0.13499653488397598,
|
|
"num_tokens": 11852489.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"entropy": 5.774700355529785,
|
|
"epoch": 0.5402226423020374,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004976545576830759,
|
|
"loss": 5.6866,
|
|
"mean_token_accuracy": 0.14809116050601007,
|
|
"num_tokens": 11861499.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"entropy": 5.837946367263794,
|
|
"epoch": 0.5406427221172023,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004976502430218924,
|
|
"loss": 5.7586,
|
|
"mean_token_accuracy": 0.1402556501328945,
|
|
"num_tokens": 11871685.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"entropy": 5.817818880081177,
|
|
"epoch": 0.5410628019323671,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004976459244165744,
|
|
"loss": 5.6802,
|
|
"mean_token_accuracy": 0.15272180885076522,
|
|
"num_tokens": 11881340.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"entropy": 5.70283875465393,
|
|
"epoch": 0.541482881747532,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004976416018671986,
|
|
"loss": 5.6817,
|
|
"mean_token_accuracy": 0.1477773442864418,
|
|
"num_tokens": 11890700.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"entropy": 5.776293468475342,
|
|
"epoch": 0.5419029615626969,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004976372753738415,
|
|
"loss": 5.6896,
|
|
"mean_token_accuracy": 0.1432950407266617,
|
|
"num_tokens": 11900329.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"entropy": 5.955761337280274,
|
|
"epoch": 0.5423230413778618,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004976329449365795,
|
|
"loss": 5.7217,
|
|
"mean_token_accuracy": 0.14292674511671066,
|
|
"num_tokens": 11909915.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"entropy": 5.778019428253174,
|
|
"epoch": 0.5427431211930267,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004976286105554897,
|
|
"loss": 5.7279,
|
|
"mean_token_accuracy": 0.15403613746166228,
|
|
"num_tokens": 11918302.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"entropy": 5.734020948410034,
|
|
"epoch": 0.5431632010081916,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004976242722306487,
|
|
"loss": 5.6907,
|
|
"mean_token_accuracy": 0.15065288841724395,
|
|
"num_tokens": 11927794.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"entropy": 5.854168796539307,
|
|
"epoch": 0.5435832808233564,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004976199299621333,
|
|
"loss": 5.7169,
|
|
"mean_token_accuracy": 0.1502951353788376,
|
|
"num_tokens": 11937701.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"entropy": 5.70092806816101,
|
|
"epoch": 0.5440033606385213,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004976155837500205,
|
|
"loss": 5.6271,
|
|
"mean_token_accuracy": 0.15380519181489943,
|
|
"num_tokens": 11946106.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"entropy": 5.770787239074707,
|
|
"epoch": 0.5444234404536862,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004976112335943872,
|
|
"loss": 5.5681,
|
|
"mean_token_accuracy": 0.15551166981458664,
|
|
"num_tokens": 11954604.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"entropy": 5.674026393890381,
|
|
"epoch": 0.5448435202688511,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004976068794953106,
|
|
"loss": 5.623,
|
|
"mean_token_accuracy": 0.15597020387649535,
|
|
"num_tokens": 11963664.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"entropy": 5.792567873001099,
|
|
"epoch": 0.545263600084016,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004976025214528677,
|
|
"loss": 5.6413,
|
|
"mean_token_accuracy": 0.15098657310009003,
|
|
"num_tokens": 11973426.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"entropy": 5.726925039291382,
|
|
"epoch": 0.5456836798991809,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004975981594671359,
|
|
"loss": 5.6801,
|
|
"mean_token_accuracy": 0.14519474059343337,
|
|
"num_tokens": 11982339.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"entropy": 5.827198791503906,
|
|
"epoch": 0.5461037597143458,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004975937935381921,
|
|
"loss": 5.7182,
|
|
"mean_token_accuracy": 0.1518527202308178,
|
|
"num_tokens": 11992016.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"entropy": 5.757827472686768,
|
|
"epoch": 0.5465238395295106,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000497589423666114,
|
|
"loss": 5.7083,
|
|
"mean_token_accuracy": 0.14981078058481218,
|
|
"num_tokens": 12000616.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"entropy": 5.628077507019043,
|
|
"epoch": 0.5469439193446755,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004975850498509789,
|
|
"loss": 5.6115,
|
|
"mean_token_accuracy": 0.155507330596447,
|
|
"num_tokens": 12009717.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"entropy": 5.750427150726319,
|
|
"epoch": 0.5473639991598404,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004975806720928642,
|
|
"loss": 5.6897,
|
|
"mean_token_accuracy": 0.15005779415369033,
|
|
"num_tokens": 12018020.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"entropy": 5.818803167343139,
|
|
"epoch": 0.5477840789750053,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004975762903918475,
|
|
"loss": 5.6906,
|
|
"mean_token_accuracy": 0.14765678942203522,
|
|
"num_tokens": 12027119.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"entropy": 5.7886378288269045,
|
|
"epoch": 0.5482041587901701,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004975719047480064,
|
|
"loss": 5.6594,
|
|
"mean_token_accuracy": 0.1583278015255928,
|
|
"num_tokens": 12035566.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"entropy": 5.725280332565307,
|
|
"epoch": 0.548624238605335,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004975675151614187,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.1575550004839897,
|
|
"num_tokens": 12044505.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"entropy": 5.66980390548706,
|
|
"epoch": 0.5490443184204999,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.000497563121632162,
|
|
"loss": 5.6574,
|
|
"mean_token_accuracy": 0.1514866828918457,
|
|
"num_tokens": 12053338.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"entropy": 5.768905687332153,
|
|
"epoch": 0.5494643982356647,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004975587241603142,
|
|
"loss": 5.6602,
|
|
"mean_token_accuracy": 0.14860893040895462,
|
|
"num_tokens": 12063235.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"entropy": 5.842012786865235,
|
|
"epoch": 0.5498844780508296,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004975543227459533,
|
|
"loss": 5.7328,
|
|
"mean_token_accuracy": 0.14544900879263878,
|
|
"num_tokens": 12072490.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"entropy": 5.826513576507568,
|
|
"epoch": 0.5503045578659945,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004975499173891571,
|
|
"loss": 5.7879,
|
|
"mean_token_accuracy": 0.14479437321424485,
|
|
"num_tokens": 12081474.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"entropy": 5.7784984588623045,
|
|
"epoch": 0.5507246376811594,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004975455080900037,
|
|
"loss": 5.643,
|
|
"mean_token_accuracy": 0.15413787513971328,
|
|
"num_tokens": 12090963.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"entropy": 5.757335090637207,
|
|
"epoch": 0.5511447174963243,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004975410948485713,
|
|
"loss": 5.6689,
|
|
"mean_token_accuracy": 0.14914965778589248,
|
|
"num_tokens": 12099786.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"entropy": 5.719509267807007,
|
|
"epoch": 0.5515647973114892,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004975366776649379,
|
|
"loss": 5.6816,
|
|
"mean_token_accuracy": 0.15208746641874313,
|
|
"num_tokens": 12108469.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"entropy": 5.7668781757354735,
|
|
"epoch": 0.5519848771266541,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004975322565391818,
|
|
"loss": 5.6629,
|
|
"mean_token_accuracy": 0.15438627898693086,
|
|
"num_tokens": 12118287.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"entropy": 5.862242794036865,
|
|
"epoch": 0.5524049569418189,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004975278314713814,
|
|
"loss": 5.8088,
|
|
"mean_token_accuracy": 0.1440727099776268,
|
|
"num_tokens": 12127122.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"entropy": 5.806251859664917,
|
|
"epoch": 0.5528250367569838,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004975234024616152,
|
|
"loss": 5.6977,
|
|
"mean_token_accuracy": 0.15571955889463424,
|
|
"num_tokens": 12136395.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"entropy": 5.695988988876342,
|
|
"epoch": 0.5532451165721487,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004975189695099613,
|
|
"loss": 5.683,
|
|
"mean_token_accuracy": 0.1485990047454834,
|
|
"num_tokens": 12145025.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"entropy": 5.828900909423828,
|
|
"epoch": 0.5536651963873136,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004975145326164985,
|
|
"loss": 5.7345,
|
|
"mean_token_accuracy": 0.1466663308441639,
|
|
"num_tokens": 12154352.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"entropy": 5.78888168334961,
|
|
"epoch": 0.5540852762024785,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004975100917813055,
|
|
"loss": 5.6427,
|
|
"mean_token_accuracy": 0.1504230782389641,
|
|
"num_tokens": 12163802.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"entropy": 5.719293403625488,
|
|
"epoch": 0.5545053560176434,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004975056470044606,
|
|
"loss": 5.6601,
|
|
"mean_token_accuracy": 0.15201830267906188,
|
|
"num_tokens": 12173111.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"entropy": 5.788504505157471,
|
|
"epoch": 0.5549254358328082,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004975011982860428,
|
|
"loss": 5.7128,
|
|
"mean_token_accuracy": 0.15041669309139252,
|
|
"num_tokens": 12182048.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"entropy": 5.795574474334717,
|
|
"epoch": 0.5553455156479731,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004974967456261309,
|
|
"loss": 5.6928,
|
|
"mean_token_accuracy": 0.1525782212615013,
|
|
"num_tokens": 12191501.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"entropy": 5.853242444992065,
|
|
"epoch": 0.555765595463138,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004974922890248036,
|
|
"loss": 5.7179,
|
|
"mean_token_accuracy": 0.15184747502207757,
|
|
"num_tokens": 12201132.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"entropy": 5.852621126174927,
|
|
"epoch": 0.5561856752783029,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.00049748782848214,
|
|
"loss": 5.8361,
|
|
"mean_token_accuracy": 0.14677973017096518,
|
|
"num_tokens": 12211082.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"entropy": 5.785615491867065,
|
|
"epoch": 0.5566057550934678,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004974833639982192,
|
|
"loss": 5.6457,
|
|
"mean_token_accuracy": 0.15385498329997063,
|
|
"num_tokens": 12219946.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"entropy": 5.848658609390259,
|
|
"epoch": 0.5570258349086327,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.00049747889557312,
|
|
"loss": 5.7623,
|
|
"mean_token_accuracy": 0.14732073992490768,
|
|
"num_tokens": 12229668.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"entropy": 5.834868860244751,
|
|
"epoch": 0.5574459147237976,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004974744232069219,
|
|
"loss": 5.756,
|
|
"mean_token_accuracy": 0.15054013729095458,
|
|
"num_tokens": 12238750.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"entropy": 5.799629974365234,
|
|
"epoch": 0.5578659945389624,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004974699468997038,
|
|
"loss": 5.6952,
|
|
"mean_token_accuracy": 0.14818740338087083,
|
|
"num_tokens": 12246825.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"entropy": 5.715137672424317,
|
|
"epoch": 0.5582860743541272,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004974654666515452,
|
|
"loss": 5.6398,
|
|
"mean_token_accuracy": 0.14599109143018724,
|
|
"num_tokens": 12256413.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"entropy": 5.772627592086792,
|
|
"epoch": 0.5587061541692921,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004974609824625254,
|
|
"loss": 5.6647,
|
|
"mean_token_accuracy": 0.15836252719163896,
|
|
"num_tokens": 12265458.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"entropy": 5.642200517654419,
|
|
"epoch": 0.559126233984457,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004974564943327239,
|
|
"loss": 5.6032,
|
|
"mean_token_accuracy": 0.1530925676226616,
|
|
"num_tokens": 12274124.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"entropy": 5.643485116958618,
|
|
"epoch": 0.5595463137996219,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.00049745200226222,
|
|
"loss": 5.5692,
|
|
"mean_token_accuracy": 0.1617315962910652,
|
|
"num_tokens": 12283513.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"entropy": 5.769548034667968,
|
|
"epoch": 0.5599663936147868,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004974475062510936,
|
|
"loss": 5.6941,
|
|
"mean_token_accuracy": 0.15532324314117432,
|
|
"num_tokens": 12292396.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"entropy": 5.82047667503357,
|
|
"epoch": 0.5603864734299517,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004974430062994242,
|
|
"loss": 5.7225,
|
|
"mean_token_accuracy": 0.15115630030632018,
|
|
"num_tokens": 12301604.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"entropy": 5.886059427261353,
|
|
"epoch": 0.5608065532451165,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004974385024072912,
|
|
"loss": 5.7476,
|
|
"mean_token_accuracy": 0.14723663330078124,
|
|
"num_tokens": 12310458.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"entropy": 5.850595569610595,
|
|
"epoch": 0.5612266330602814,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497433994574775,
|
|
"loss": 5.7541,
|
|
"mean_token_accuracy": 0.14729131162166595,
|
|
"num_tokens": 12319620.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"entropy": 5.768581199645996,
|
|
"epoch": 0.5616467128754463,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.000497429482801955,
|
|
"loss": 5.8087,
|
|
"mean_token_accuracy": 0.14342449381947517,
|
|
"num_tokens": 12329518.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"entropy": 5.740103340148925,
|
|
"epoch": 0.5620667926906112,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004974249670889111,
|
|
"loss": 5.6224,
|
|
"mean_token_accuracy": 0.14958661496639253,
|
|
"num_tokens": 12338244.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"entropy": 5.922348546981811,
|
|
"epoch": 0.5624868725057761,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004974204474357237,
|
|
"loss": 5.8038,
|
|
"mean_token_accuracy": 0.14635297060012817,
|
|
"num_tokens": 12347962.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"entropy": 5.858186721801758,
|
|
"epoch": 0.562906952320941,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004974159238424723,
|
|
"loss": 5.7152,
|
|
"mean_token_accuracy": 0.14685503840446473,
|
|
"num_tokens": 12357020.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"entropy": 5.7741382122039795,
|
|
"epoch": 0.5633270321361059,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004974113963092376,
|
|
"loss": 5.6818,
|
|
"mean_token_accuracy": 0.15062317550182341,
|
|
"num_tokens": 12366108.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"entropy": 5.800469636917114,
|
|
"epoch": 0.5637471119512707,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004974068648360995,
|
|
"loss": 5.6174,
|
|
"mean_token_accuracy": 0.16103638261556624,
|
|
"num_tokens": 12374508.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"entropy": 5.716407442092896,
|
|
"epoch": 0.5641671917664356,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004974023294231383,
|
|
"loss": 5.6307,
|
|
"mean_token_accuracy": 0.15565890967845916,
|
|
"num_tokens": 12383555.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"entropy": 5.739489316940308,
|
|
"epoch": 0.5645872715816005,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004973977900704342,
|
|
"loss": 5.7318,
|
|
"mean_token_accuracy": 0.14744656831026076,
|
|
"num_tokens": 12392680.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"entropy": 5.8620096206665036,
|
|
"epoch": 0.5650073513967654,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004973932467780679,
|
|
"loss": 5.7678,
|
|
"mean_token_accuracy": 0.14722730666399003,
|
|
"num_tokens": 12401881.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"entropy": 5.841188859939575,
|
|
"epoch": 0.5654274312119303,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004973886995461197,
|
|
"loss": 5.7751,
|
|
"mean_token_accuracy": 0.14041661322116852,
|
|
"num_tokens": 12411487.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"entropy": 5.73593955039978,
|
|
"epoch": 0.5658475110270952,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004973841483746703,
|
|
"loss": 5.5558,
|
|
"mean_token_accuracy": 0.1630205363035202,
|
|
"num_tokens": 12420376.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"entropy": 5.657348871231079,
|
|
"epoch": 0.5662675908422601,
|
|
"grad_norm": 1.234375,
|
|
"learning_rate": 0.0004973795932638001,
|
|
"loss": 5.5997,
|
|
"mean_token_accuracy": 0.1595743790268898,
|
|
"num_tokens": 12429518.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"entropy": 5.758114671707153,
|
|
"epoch": 0.5666876706574249,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.00049737503421359,
|
|
"loss": 5.591,
|
|
"mean_token_accuracy": 0.15658295452594756,
|
|
"num_tokens": 12438952.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"entropy": 5.740869188308716,
|
|
"epoch": 0.5671077504725898,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004973704712241206,
|
|
"loss": 5.6004,
|
|
"mean_token_accuracy": 0.15196311324834824,
|
|
"num_tokens": 12448576.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"entropy": 5.697316026687622,
|
|
"epoch": 0.5675278302877547,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004973659042954729,
|
|
"loss": 5.651,
|
|
"mean_token_accuracy": 0.1531267300248146,
|
|
"num_tokens": 12458166.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"entropy": 5.621774578094483,
|
|
"epoch": 0.5679479101029196,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004973613334277277,
|
|
"loss": 5.5773,
|
|
"mean_token_accuracy": 0.15975697934627534,
|
|
"num_tokens": 12467271.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"entropy": 5.788559246063232,
|
|
"epoch": 0.5683679899180845,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004973567586209658,
|
|
"loss": 5.7353,
|
|
"mean_token_accuracy": 0.14790709912776948,
|
|
"num_tokens": 12476255.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"entropy": 5.805595254898071,
|
|
"epoch": 0.5687880697332494,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.0004973521798752686,
|
|
"loss": 5.6924,
|
|
"mean_token_accuracy": 0.14894914180040358,
|
|
"num_tokens": 12485096.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"entropy": 5.907484722137451,
|
|
"epoch": 0.5692081495484141,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000497347597190717,
|
|
"loss": 5.7415,
|
|
"mean_token_accuracy": 0.1513557866215706,
|
|
"num_tokens": 12494405.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"entropy": 5.706912994384766,
|
|
"epoch": 0.569628229363579,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004973430105673921,
|
|
"loss": 5.664,
|
|
"mean_token_accuracy": 0.14975441992282867,
|
|
"num_tokens": 12503349.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"entropy": 5.7776655673980715,
|
|
"epoch": 0.5700483091787439,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004973384200053754,
|
|
"loss": 5.7322,
|
|
"mean_token_accuracy": 0.14976128786802292,
|
|
"num_tokens": 12513122.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"entropy": 5.756508922576904,
|
|
"epoch": 0.5704683889939088,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000497333825504748,
|
|
"loss": 5.6834,
|
|
"mean_token_accuracy": 0.15084973052144052,
|
|
"num_tokens": 12523614.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"entropy": 5.739598941802979,
|
|
"epoch": 0.5708884688090737,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004973292270655914,
|
|
"loss": 5.7215,
|
|
"mean_token_accuracy": 0.1441544845700264,
|
|
"num_tokens": 12532031.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"entropy": 5.866616153717041,
|
|
"epoch": 0.5713085486242386,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.000497324624687987,
|
|
"loss": 5.8053,
|
|
"mean_token_accuracy": 0.14371495842933654,
|
|
"num_tokens": 12542239.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"entropy": 5.8771069049835205,
|
|
"epoch": 0.5717286284394035,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004973200183720164,
|
|
"loss": 5.7128,
|
|
"mean_token_accuracy": 0.14354896992444993,
|
|
"num_tokens": 12552608.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"entropy": 5.6871239185333256,
|
|
"epoch": 0.5721487082545683,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004973154081177611,
|
|
"loss": 5.5667,
|
|
"mean_token_accuracy": 0.15022857040166854,
|
|
"num_tokens": 12562020.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"entropy": 5.765218448638916,
|
|
"epoch": 0.5725687880697332,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004973107939253027,
|
|
"loss": 5.6358,
|
|
"mean_token_accuracy": 0.16130239069461821,
|
|
"num_tokens": 12570519.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"entropy": 5.67629337310791,
|
|
"epoch": 0.5729888678848981,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004973061757947233,
|
|
"loss": 5.6383,
|
|
"mean_token_accuracy": 0.15358631312847137,
|
|
"num_tokens": 12579324.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"entropy": 5.70035400390625,
|
|
"epoch": 0.573408947700063,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004973015537261043,
|
|
"loss": 5.7002,
|
|
"mean_token_accuracy": 0.15523785203695298,
|
|
"num_tokens": 12588014.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"entropy": 5.787239074707031,
|
|
"epoch": 0.5738290275152279,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004972969277195279,
|
|
"loss": 5.6983,
|
|
"mean_token_accuracy": 0.15244747400283815,
|
|
"num_tokens": 12596882.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"entropy": 5.7592627048492435,
|
|
"epoch": 0.5742491073303928,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004972922977750757,
|
|
"loss": 5.6187,
|
|
"mean_token_accuracy": 0.15164665579795839,
|
|
"num_tokens": 12606069.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"entropy": 5.75448842048645,
|
|
"epoch": 0.5746691871455577,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.00049728766389283,
|
|
"loss": 5.6692,
|
|
"mean_token_accuracy": 0.1461893856525421,
|
|
"num_tokens": 12615167.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"entropy": 5.7733536720275875,
|
|
"epoch": 0.5750892669607225,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004972830260728729,
|
|
"loss": 5.6936,
|
|
"mean_token_accuracy": 0.15012103021144868,
|
|
"num_tokens": 12624230.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"entropy": 5.8393168449401855,
|
|
"epoch": 0.5755093467758874,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004972783843152863,
|
|
"loss": 5.6759,
|
|
"mean_token_accuracy": 0.15672653466463088,
|
|
"num_tokens": 12633158.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"entropy": 5.7473838329315186,
|
|
"epoch": 0.5759294265910523,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004972737386201527,
|
|
"loss": 5.6159,
|
|
"mean_token_accuracy": 0.1514609858393669,
|
|
"num_tokens": 12641465.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"entropy": 5.694714307785034,
|
|
"epoch": 0.5763495064062172,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004972690889875541,
|
|
"loss": 5.576,
|
|
"mean_token_accuracy": 0.1582634076476097,
|
|
"num_tokens": 12650437.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"entropy": 5.887161779403686,
|
|
"epoch": 0.5767695862213821,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004972644354175732,
|
|
"loss": 5.8013,
|
|
"mean_token_accuracy": 0.1454438552260399,
|
|
"num_tokens": 12660072.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"entropy": 5.8891956329345705,
|
|
"epoch": 0.577189666036547,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004972597779102922,
|
|
"loss": 5.8047,
|
|
"mean_token_accuracy": 0.1495344288647175,
|
|
"num_tokens": 12670405.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"entropy": 5.794363021850586,
|
|
"epoch": 0.5776097458517119,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004972551164657937,
|
|
"loss": 5.6863,
|
|
"mean_token_accuracy": 0.1481093443930149,
|
|
"num_tokens": 12679992.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"entropy": 5.821959114074707,
|
|
"epoch": 0.5780298256668767,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004972504510841602,
|
|
"loss": 5.741,
|
|
"mean_token_accuracy": 0.14912082627415657,
|
|
"num_tokens": 12690289.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"entropy": 5.85949215888977,
|
|
"epoch": 0.5784499054820416,
|
|
"grad_norm": 1.1875,
|
|
"learning_rate": 0.0004972457817654745,
|
|
"loss": 5.7464,
|
|
"mean_token_accuracy": 0.14608799815177917,
|
|
"num_tokens": 12700518.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"entropy": 5.854418659210205,
|
|
"epoch": 0.5788699852972065,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004972411085098191,
|
|
"loss": 5.8047,
|
|
"mean_token_accuracy": 0.13859488815069199,
|
|
"num_tokens": 12710603.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"entropy": 5.80083737373352,
|
|
"epoch": 0.5792900651123714,
|
|
"grad_norm": 1.2109375,
|
|
"learning_rate": 0.000497236431317277,
|
|
"loss": 5.6832,
|
|
"mean_token_accuracy": 0.15335874259471893,
|
|
"num_tokens": 12719298.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"entropy": 5.793888568878174,
|
|
"epoch": 0.5797101449275363,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000497231750187931,
|
|
"loss": 5.679,
|
|
"mean_token_accuracy": 0.14987643510103227,
|
|
"num_tokens": 12728368.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"entropy": 5.803576946258545,
|
|
"epoch": 0.5801302247427012,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004972270651218638,
|
|
"loss": 5.7501,
|
|
"mean_token_accuracy": 0.15091048330068588,
|
|
"num_tokens": 12737898.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"entropy": 5.823219203948975,
|
|
"epoch": 0.580550304557866,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004972223761191587,
|
|
"loss": 5.6763,
|
|
"mean_token_accuracy": 0.14775750860571862,
|
|
"num_tokens": 12746761.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"entropy": 5.743349599838257,
|
|
"epoch": 0.5809703843730308,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004972176831798986,
|
|
"loss": 5.598,
|
|
"mean_token_accuracy": 0.16036793515086173,
|
|
"num_tokens": 12755128.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"entropy": 5.7417994976043705,
|
|
"epoch": 0.5813904641881957,
|
|
"grad_norm": 1.1640625,
|
|
"learning_rate": 0.0004972129863041667,
|
|
"loss": 5.7851,
|
|
"mean_token_accuracy": 0.14712280929088592,
|
|
"num_tokens": 12764727.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"entropy": 5.813629150390625,
|
|
"epoch": 0.5818105440033606,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004972082854920462,
|
|
"loss": 5.6555,
|
|
"mean_token_accuracy": 0.15503355711698533,
|
|
"num_tokens": 12773557.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"entropy": 5.765597820281982,
|
|
"epoch": 0.5822306238185255,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004972035807436203,
|
|
"loss": 5.6281,
|
|
"mean_token_accuracy": 0.15963127166032792,
|
|
"num_tokens": 12782525.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"entropy": 5.826959037780762,
|
|
"epoch": 0.5826507036336904,
|
|
"grad_norm": 1.21875,
|
|
"learning_rate": 0.0004971988720589723,
|
|
"loss": 5.7343,
|
|
"mean_token_accuracy": 0.14858472794294358,
|
|
"num_tokens": 12791534.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"entropy": 5.811407089233398,
|
|
"epoch": 0.5830707834488553,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004971941594381858,
|
|
"loss": 5.6378,
|
|
"mean_token_accuracy": 0.15586716905236245,
|
|
"num_tokens": 12800662.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"entropy": 5.8181867599487305,
|
|
"epoch": 0.5834908632640201,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004971894428813441,
|
|
"loss": 5.6793,
|
|
"mean_token_accuracy": 0.15237494111061095,
|
|
"num_tokens": 12809440.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"entropy": 5.825567674636841,
|
|
"epoch": 0.583910943079185,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000497184722388531,
|
|
"loss": 5.7669,
|
|
"mean_token_accuracy": 0.14334406405687333,
|
|
"num_tokens": 12818560.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"entropy": 5.836616373062133,
|
|
"epoch": 0.5843310228943499,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004971799979598297,
|
|
"loss": 5.7006,
|
|
"mean_token_accuracy": 0.14847517311573027,
|
|
"num_tokens": 12827898.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"entropy": 5.705964803695679,
|
|
"epoch": 0.5847511027095148,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004971752695953243,
|
|
"loss": 5.6278,
|
|
"mean_token_accuracy": 0.15511309504508972,
|
|
"num_tokens": 12837199.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"entropy": 5.776910972595215,
|
|
"epoch": 0.5851711825246797,
|
|
"grad_norm": 1.3125,
|
|
"learning_rate": 0.0004971705372950984,
|
|
"loss": 5.6685,
|
|
"mean_token_accuracy": 0.15222510397434236,
|
|
"num_tokens": 12846493.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"entropy": 5.805812168121338,
|
|
"epoch": 0.5855912623398446,
|
|
"grad_norm": 1.203125,
|
|
"learning_rate": 0.0004971658010592358,
|
|
"loss": 5.661,
|
|
"mean_token_accuracy": 0.14836213737726212,
|
|
"num_tokens": 12855026.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"entropy": 5.7935222625732425,
|
|
"epoch": 0.5860113421550095,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004971610608878205,
|
|
"loss": 5.7399,
|
|
"mean_token_accuracy": 0.14885252118110656,
|
|
"num_tokens": 12864563.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"entropy": 5.890350008010865,
|
|
"epoch": 0.5864314219701743,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004971563167809363,
|
|
"loss": 5.7027,
|
|
"mean_token_accuracy": 0.15145302265882493,
|
|
"num_tokens": 12874358.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"entropy": 5.775705432891845,
|
|
"epoch": 0.5868515017853392,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004971515687386674,
|
|
"loss": 5.6936,
|
|
"mean_token_accuracy": 0.14974414557218552,
|
|
"num_tokens": 12883110.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"entropy": 5.7816088676452635,
|
|
"epoch": 0.5872715816005041,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004971468167610978,
|
|
"loss": 5.7569,
|
|
"mean_token_accuracy": 0.15565478801727295,
|
|
"num_tokens": 12892977.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"entropy": 5.780284309387207,
|
|
"epoch": 0.587691661415669,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004971420608483117,
|
|
"loss": 5.557,
|
|
"mean_token_accuracy": 0.15739939212799073,
|
|
"num_tokens": 12902327.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"entropy": 5.6380256652832035,
|
|
"epoch": 0.5881117412308339,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.0004971373010003936,
|
|
"loss": 5.5792,
|
|
"mean_token_accuracy": 0.16282830759882927,
|
|
"num_tokens": 12911957.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"entropy": 5.734653520584106,
|
|
"epoch": 0.5885318210459988,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004971325372174274,
|
|
"loss": 5.6761,
|
|
"mean_token_accuracy": 0.14648626297712325,
|
|
"num_tokens": 12920380.0,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"entropy": 5.786434602737427,
|
|
"epoch": 0.5889519008611637,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004971277694994976,
|
|
"loss": 5.7325,
|
|
"mean_token_accuracy": 0.15107914805412292,
|
|
"num_tokens": 12929670.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"entropy": 5.806161260604858,
|
|
"epoch": 0.5893719806763285,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000497122997846689,
|
|
"loss": 5.6395,
|
|
"mean_token_accuracy": 0.15913416296243668,
|
|
"num_tokens": 12938185.0,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"entropy": 5.785260486602783,
|
|
"epoch": 0.5897920604914934,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004971182222590857,
|
|
"loss": 5.6737,
|
|
"mean_token_accuracy": 0.15860777348279953,
|
|
"num_tokens": 12947706.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"entropy": 5.699825477600098,
|
|
"epoch": 0.5902121403066583,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004971134427367725,
|
|
"loss": 5.6616,
|
|
"mean_token_accuracy": 0.15406165570020675,
|
|
"num_tokens": 12957393.0,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"entropy": 5.792528390884399,
|
|
"epoch": 0.5906322201218231,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000497108659279834,
|
|
"loss": 5.5571,
|
|
"mean_token_accuracy": 0.1610640689730644,
|
|
"num_tokens": 12967165.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"entropy": 5.813483619689942,
|
|
"epoch": 0.591052299936988,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004971038718883551,
|
|
"loss": 5.6962,
|
|
"mean_token_accuracy": 0.144030924141407,
|
|
"num_tokens": 12976490.0,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"entropy": 5.810479640960693,
|
|
"epoch": 0.5914723797521529,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004970990805624203,
|
|
"loss": 5.6798,
|
|
"mean_token_accuracy": 0.14762643873691558,
|
|
"num_tokens": 12985423.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"entropy": 5.7493095874786375,
|
|
"epoch": 0.5918924595673178,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004970942853021147,
|
|
"loss": 5.586,
|
|
"mean_token_accuracy": 0.15507588982582093,
|
|
"num_tokens": 12994510.0,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"entropy": 5.759291505813598,
|
|
"epoch": 0.5923125393824826,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004970894861075232,
|
|
"loss": 5.6933,
|
|
"mean_token_accuracy": 0.15216402411460878,
|
|
"num_tokens": 13003383.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"entropy": 5.7968847274780275,
|
|
"epoch": 0.5927326191976475,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004970846829787309,
|
|
"loss": 5.6424,
|
|
"mean_token_accuracy": 0.15637523382902146,
|
|
"num_tokens": 13012550.0,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"entropy": 5.814409351348877,
|
|
"epoch": 0.5931526990128124,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004970798759158227,
|
|
"loss": 5.717,
|
|
"mean_token_accuracy": 0.14880919829010963,
|
|
"num_tokens": 13022066.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"entropy": 5.810356950759887,
|
|
"epoch": 0.5935727788279773,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004970750649188839,
|
|
"loss": 5.6906,
|
|
"mean_token_accuracy": 0.1568908266723156,
|
|
"num_tokens": 13031008.0,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"entropy": 5.721108341217041,
|
|
"epoch": 0.5939928586431422,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004970702499879998,
|
|
"loss": 5.6632,
|
|
"mean_token_accuracy": 0.15371381118893623,
|
|
"num_tokens": 13040366.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"entropy": 5.732092952728271,
|
|
"epoch": 0.5944129384583071,
|
|
"grad_norm": 1.28125,
|
|
"learning_rate": 0.0004970654311232554,
|
|
"loss": 5.6904,
|
|
"mean_token_accuracy": 0.15139214396476747,
|
|
"num_tokens": 13051140.0,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"entropy": 5.8530854225158695,
|
|
"epoch": 0.594833018273472,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004970606083247362,
|
|
"loss": 5.6124,
|
|
"mean_token_accuracy": 0.1549868643283844,
|
|
"num_tokens": 13059835.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"entropy": 5.682567930221557,
|
|
"epoch": 0.5952530980886368,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004970557815925278,
|
|
"loss": 5.5625,
|
|
"mean_token_accuracy": 0.15498380362987518,
|
|
"num_tokens": 13068909.0,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"entropy": 5.665302896499634,
|
|
"epoch": 0.5956731779038017,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004970509509267155,
|
|
"loss": 5.6395,
|
|
"mean_token_accuracy": 0.15328293964266776,
|
|
"num_tokens": 13078380.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"entropy": 5.856351041793824,
|
|
"epoch": 0.5960932577189666,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004970461163273849,
|
|
"loss": 5.6873,
|
|
"mean_token_accuracy": 0.15368358492851258,
|
|
"num_tokens": 13087774.0,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"entropy": 5.7456623077392575,
|
|
"epoch": 0.5965133375341315,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004970412777946219,
|
|
"loss": 5.5125,
|
|
"mean_token_accuracy": 0.1564317002892494,
|
|
"num_tokens": 13095938.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"entropy": 5.6623311042785645,
|
|
"epoch": 0.5969334173492964,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004970364353285117,
|
|
"loss": 5.6649,
|
|
"mean_token_accuracy": 0.15760405361652374,
|
|
"num_tokens": 13104661.0,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"entropy": 5.799459838867188,
|
|
"epoch": 0.5973534971644613,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004970315889291405,
|
|
"loss": 5.6498,
|
|
"mean_token_accuracy": 0.1500176966190338,
|
|
"num_tokens": 13114505.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"entropy": 5.67414870262146,
|
|
"epoch": 0.5977735769796261,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004970267385965941,
|
|
"loss": 5.5967,
|
|
"mean_token_accuracy": 0.15627617239952088,
|
|
"num_tokens": 13124590.0,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"entropy": 5.709624147415161,
|
|
"epoch": 0.598193656794791,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004970218843309583,
|
|
"loss": 5.5727,
|
|
"mean_token_accuracy": 0.16128048151731492,
|
|
"num_tokens": 13134026.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"entropy": 5.858076572418213,
|
|
"epoch": 0.5986137366099559,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004970170261323192,
|
|
"loss": 5.7412,
|
|
"mean_token_accuracy": 0.15393318980932236,
|
|
"num_tokens": 13142654.0,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"entropy": 5.691959428787231,
|
|
"epoch": 0.5990338164251208,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004970121640007627,
|
|
"loss": 5.6406,
|
|
"mean_token_accuracy": 0.1516393780708313,
|
|
"num_tokens": 13151177.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"entropy": 5.724456405639648,
|
|
"epoch": 0.5994538962402857,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004970072979363751,
|
|
"loss": 5.6415,
|
|
"mean_token_accuracy": 0.15101922899484635,
|
|
"num_tokens": 13159689.0,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"entropy": 5.751378107070923,
|
|
"epoch": 0.5998739760554506,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004970024279392425,
|
|
"loss": 5.6808,
|
|
"mean_token_accuracy": 0.14915067553520203,
|
|
"num_tokens": 13168601.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"entropy": 5.811783504486084,
|
|
"epoch": 0.6002940558706155,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004969975540094513,
|
|
"loss": 5.65,
|
|
"mean_token_accuracy": 0.15383494645357132,
|
|
"num_tokens": 13177035.0,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"entropy": 5.808345937728882,
|
|
"epoch": 0.6007141356857802,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004969926761470876,
|
|
"loss": 5.6175,
|
|
"mean_token_accuracy": 0.16169959604740142,
|
|
"num_tokens": 13185444.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"entropy": 5.739962339401245,
|
|
"epoch": 0.6011342155009451,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.000496987794352238,
|
|
"loss": 5.6269,
|
|
"mean_token_accuracy": 0.158029805123806,
|
|
"num_tokens": 13194987.0,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"entropy": 5.7030730724334715,
|
|
"epoch": 0.60155429531611,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004969829086249889,
|
|
"loss": 5.6506,
|
|
"mean_token_accuracy": 0.14698696434497832,
|
|
"num_tokens": 13203807.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"entropy": 5.818206834793091,
|
|
"epoch": 0.6019743751312749,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000496978018965427,
|
|
"loss": 5.759,
|
|
"mean_token_accuracy": 0.14981004670262338,
|
|
"num_tokens": 13214362.0,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"entropy": 5.880270910263062,
|
|
"epoch": 0.6023944549464398,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004969731253736387,
|
|
"loss": 5.7679,
|
|
"mean_token_accuracy": 0.1471377916634083,
|
|
"num_tokens": 13224192.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"entropy": 5.780945491790772,
|
|
"epoch": 0.6028145347616047,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004969682278497109,
|
|
"loss": 5.7083,
|
|
"mean_token_accuracy": 0.15497646927833558,
|
|
"num_tokens": 13234430.0,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"entropy": 5.719972944259643,
|
|
"epoch": 0.6032346145767696,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004969633263937301,
|
|
"loss": 5.6096,
|
|
"mean_token_accuracy": 0.15373648703098297,
|
|
"num_tokens": 13243681.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"entropy": 5.953779697418213,
|
|
"epoch": 0.6036546943919344,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004969584210057832,
|
|
"loss": 5.9049,
|
|
"mean_token_accuracy": 0.14223207384347916,
|
|
"num_tokens": 13254334.0,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"entropy": 5.90763521194458,
|
|
"epoch": 0.6040747742070993,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004969535116859573,
|
|
"loss": 5.69,
|
|
"mean_token_accuracy": 0.15484999641776084,
|
|
"num_tokens": 13263781.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"entropy": 5.698655986785889,
|
|
"epoch": 0.6044948540222642,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004969485984343392,
|
|
"loss": 5.623,
|
|
"mean_token_accuracy": 0.153725266456604,
|
|
"num_tokens": 13272831.0,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"entropy": 5.807196807861328,
|
|
"epoch": 0.6049149338374291,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000496943681251016,
|
|
"loss": 5.6458,
|
|
"mean_token_accuracy": 0.15263475701212884,
|
|
"num_tokens": 13281621.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"entropy": 5.737537956237793,
|
|
"epoch": 0.605335013652594,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004969387601360747,
|
|
"loss": 5.6339,
|
|
"mean_token_accuracy": 0.1460575617849827,
|
|
"num_tokens": 13291021.0,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"entropy": 5.792116641998291,
|
|
"epoch": 0.6057550934677589,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004969338350896026,
|
|
"loss": 5.6657,
|
|
"mean_token_accuracy": 0.15297241359949112,
|
|
"num_tokens": 13299752.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"entropy": 5.843834066390992,
|
|
"epoch": 0.6061751732829238,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004969289061116869,
|
|
"loss": 5.692,
|
|
"mean_token_accuracy": 0.14698407873511316,
|
|
"num_tokens": 13309112.0,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"entropy": 5.78744421005249,
|
|
"epoch": 0.6065952530980886,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004969239732024148,
|
|
"loss": 5.6878,
|
|
"mean_token_accuracy": 0.15506349503993988,
|
|
"num_tokens": 13318328.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"entropy": 5.677953767776489,
|
|
"epoch": 0.6070153329132535,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004969190363618739,
|
|
"loss": 5.5847,
|
|
"mean_token_accuracy": 0.1533963978290558,
|
|
"num_tokens": 13328940.0,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"entropy": 5.71037449836731,
|
|
"epoch": 0.6074354127284184,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004969140955901516,
|
|
"loss": 5.6079,
|
|
"mean_token_accuracy": 0.15790644884109498,
|
|
"num_tokens": 13337829.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"entropy": 5.844385385513306,
|
|
"epoch": 0.6078554925435833,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004969091508873352,
|
|
"loss": 5.7794,
|
|
"mean_token_accuracy": 0.15029568076133729,
|
|
"num_tokens": 13348289.0,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"entropy": 5.801990795135498,
|
|
"epoch": 0.6082755723587482,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004969042022535126,
|
|
"loss": 5.6815,
|
|
"mean_token_accuracy": 0.15717827379703522,
|
|
"num_tokens": 13357292.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"entropy": 5.778904294967651,
|
|
"epoch": 0.6086956521739131,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004968992496887713,
|
|
"loss": 5.7374,
|
|
"mean_token_accuracy": 0.1471494920551777,
|
|
"num_tokens": 13366640.0,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"entropy": 5.804550743103027,
|
|
"epoch": 0.609115731989078,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004968942931931989,
|
|
"loss": 5.6431,
|
|
"mean_token_accuracy": 0.15975850373506545,
|
|
"num_tokens": 13377509.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"entropy": 5.821708917617798,
|
|
"epoch": 0.6095358118042428,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004968893327668835,
|
|
"loss": 5.7281,
|
|
"mean_token_accuracy": 0.14594982862472533,
|
|
"num_tokens": 13386573.0,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"entropy": 5.695713996887207,
|
|
"epoch": 0.6099558916194077,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004968843684099128,
|
|
"loss": 5.5984,
|
|
"mean_token_accuracy": 0.15431449562311172,
|
|
"num_tokens": 13395790.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"entropy": 5.6999798774719235,
|
|
"epoch": 0.6103759714345726,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004968794001223747,
|
|
"loss": 5.659,
|
|
"mean_token_accuracy": 0.15051533728837968,
|
|
"num_tokens": 13405265.0,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"entropy": 5.771698093414306,
|
|
"epoch": 0.6107960512497375,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004968744279043574,
|
|
"loss": 5.6372,
|
|
"mean_token_accuracy": 0.1554905042052269,
|
|
"num_tokens": 13413796.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"entropy": 5.878541707992554,
|
|
"epoch": 0.6112161310649024,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004968694517559488,
|
|
"loss": 5.6955,
|
|
"mean_token_accuracy": 0.15228554159402846,
|
|
"num_tokens": 13423299.0,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"entropy": 5.71440634727478,
|
|
"epoch": 0.6116362108800673,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004968644716772371,
|
|
"loss": 5.6168,
|
|
"mean_token_accuracy": 0.15852915048599242,
|
|
"num_tokens": 13432267.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"entropy": 5.70712103843689,
|
|
"epoch": 0.612056290695232,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004968594876683105,
|
|
"loss": 5.7059,
|
|
"mean_token_accuracy": 0.1497867949306965,
|
|
"num_tokens": 13442332.0,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"entropy": 5.776857042312622,
|
|
"epoch": 0.6124763705103969,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004968544997292572,
|
|
"loss": 5.65,
|
|
"mean_token_accuracy": 0.1574311837553978,
|
|
"num_tokens": 13451700.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"entropy": 5.810136556625366,
|
|
"epoch": 0.6128964503255618,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004968495078601659,
|
|
"loss": 5.7362,
|
|
"mean_token_accuracy": 0.14313261955976486,
|
|
"num_tokens": 13461009.0,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"entropy": 5.852288770675659,
|
|
"epoch": 0.6133165301407267,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004968445120611247,
|
|
"loss": 5.7511,
|
|
"mean_token_accuracy": 0.15160316973924637,
|
|
"num_tokens": 13470341.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"entropy": 5.815583086013794,
|
|
"epoch": 0.6137366099558916,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004968395123322223,
|
|
"loss": 5.6802,
|
|
"mean_token_accuracy": 0.15470752790570258,
|
|
"num_tokens": 13479898.0,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"entropy": 5.749247694015503,
|
|
"epoch": 0.6141566897710565,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000496834508673547,
|
|
"loss": 5.584,
|
|
"mean_token_accuracy": 0.15403383672237397,
|
|
"num_tokens": 13488116.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"entropy": 5.759438514709473,
|
|
"epoch": 0.6145767695862214,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004968295010851877,
|
|
"loss": 5.6356,
|
|
"mean_token_accuracy": 0.15597606748342513,
|
|
"num_tokens": 13497814.0,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"entropy": 5.793872499465943,
|
|
"epoch": 0.6149968494013862,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004968244895672331,
|
|
"loss": 5.6334,
|
|
"mean_token_accuracy": 0.14847874492406846,
|
|
"num_tokens": 13506617.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"entropy": 5.775398635864258,
|
|
"epoch": 0.6154169292165511,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004968194741197718,
|
|
"loss": 5.788,
|
|
"mean_token_accuracy": 0.14676353633403777,
|
|
"num_tokens": 13516632.0,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"entropy": 5.834484338760376,
|
|
"epoch": 0.615837009031716,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004968144547428927,
|
|
"loss": 5.7015,
|
|
"mean_token_accuracy": 0.1523831970989704,
|
|
"num_tokens": 13526452.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"entropy": 5.787644147872925,
|
|
"epoch": 0.6162570888468809,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004968094314366848,
|
|
"loss": 5.6104,
|
|
"mean_token_accuracy": 0.1564450517296791,
|
|
"num_tokens": 13535663.0,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"entropy": 5.663765478134155,
|
|
"epoch": 0.6166771686620458,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000496804404201237,
|
|
"loss": 5.5183,
|
|
"mean_token_accuracy": 0.1632273629307747,
|
|
"num_tokens": 13544574.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"entropy": 5.863478326797486,
|
|
"epoch": 0.6170972484772107,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004967993730366385,
|
|
"loss": 5.7136,
|
|
"mean_token_accuracy": 0.1504902571439743,
|
|
"num_tokens": 13553041.0,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"entropy": 5.755918788909912,
|
|
"epoch": 0.6175173282923756,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004967943379429781,
|
|
"loss": 5.6698,
|
|
"mean_token_accuracy": 0.1484751097857952,
|
|
"num_tokens": 13562108.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"entropy": 5.919857025146484,
|
|
"epoch": 0.6179374081075404,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004967892989203454,
|
|
"loss": 5.8215,
|
|
"mean_token_accuracy": 0.1457022547721863,
|
|
"num_tokens": 13571500.0,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"entropy": 5.858237886428833,
|
|
"epoch": 0.6183574879227053,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004967842559688295,
|
|
"loss": 5.7304,
|
|
"mean_token_accuracy": 0.1467614322900772,
|
|
"num_tokens": 13581304.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"entropy": 5.7586640357971195,
|
|
"epoch": 0.6187775677378702,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004967792090885195,
|
|
"loss": 5.6034,
|
|
"mean_token_accuracy": 0.15647478327155112,
|
|
"num_tokens": 13590734.0,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"entropy": 5.683297252655029,
|
|
"epoch": 0.6191976475530351,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004967741582795052,
|
|
"loss": 5.6578,
|
|
"mean_token_accuracy": 0.15261005461215973,
|
|
"num_tokens": 13600486.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"entropy": 5.855695104598999,
|
|
"epoch": 0.6196177273682,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004967691035418758,
|
|
"loss": 5.6842,
|
|
"mean_token_accuracy": 0.14570327550172807,
|
|
"num_tokens": 13610542.0,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"entropy": 5.7847048282623295,
|
|
"epoch": 0.6200378071833649,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.000496764044875721,
|
|
"loss": 5.6311,
|
|
"mean_token_accuracy": 0.15491739362478257,
|
|
"num_tokens": 13619431.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"entropy": 5.743926477432251,
|
|
"epoch": 0.6204578869985298,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004967589822811303,
|
|
"loss": 5.6796,
|
|
"mean_token_accuracy": 0.1506201907992363,
|
|
"num_tokens": 13629930.0,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"entropy": 5.907500982284546,
|
|
"epoch": 0.6208779668136946,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004967539157581934,
|
|
"loss": 5.811,
|
|
"mean_token_accuracy": 0.14877504855394363,
|
|
"num_tokens": 13639439.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"entropy": 5.859608602523804,
|
|
"epoch": 0.6212980466288595,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000496748845307,
|
|
"loss": 5.7233,
|
|
"mean_token_accuracy": 0.15566221624612808,
|
|
"num_tokens": 13648548.0,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"entropy": 5.76888256072998,
|
|
"epoch": 0.6217181264440244,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004967437709276401,
|
|
"loss": 5.7631,
|
|
"mean_token_accuracy": 0.15250276327133178,
|
|
"num_tokens": 13657658.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"entropy": 5.7022544860839846,
|
|
"epoch": 0.6221382062591893,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004967386926202034,
|
|
"loss": 5.5529,
|
|
"mean_token_accuracy": 0.15799887329339982,
|
|
"num_tokens": 13666763.0,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"entropy": 5.85063681602478,
|
|
"epoch": 0.6225582860743542,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00049673361038478,
|
|
"loss": 5.7924,
|
|
"mean_token_accuracy": 0.14320089891552926,
|
|
"num_tokens": 13676527.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"entropy": 5.783917856216431,
|
|
"epoch": 0.622978365889519,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004967285242214599,
|
|
"loss": 5.7395,
|
|
"mean_token_accuracy": 0.152333851903677,
|
|
"num_tokens": 13685404.0,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"entropy": 5.781331348419189,
|
|
"epoch": 0.6233984457046838,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000496723434130333,
|
|
"loss": 5.5582,
|
|
"mean_token_accuracy": 0.15735181719064711,
|
|
"num_tokens": 13693118.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"entropy": 5.746804714202881,
|
|
"epoch": 0.6238185255198487,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004967183401114898,
|
|
"loss": 5.6315,
|
|
"mean_token_accuracy": 0.1518068231642246,
|
|
"num_tokens": 13702015.0,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"entropy": 5.754427099227906,
|
|
"epoch": 0.6242386053350136,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004967132421650203,
|
|
"loss": 5.6635,
|
|
"mean_token_accuracy": 0.15001336112618446,
|
|
"num_tokens": 13711658.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"entropy": 5.663620853424073,
|
|
"epoch": 0.6246586851501785,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004967081402910149,
|
|
"loss": 5.6707,
|
|
"mean_token_accuracy": 0.1505231335759163,
|
|
"num_tokens": 13720718.0,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"entropy": 5.7571845054626465,
|
|
"epoch": 0.6250787649653434,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000496703034489564,
|
|
"loss": 5.5426,
|
|
"mean_token_accuracy": 0.15831270217895507,
|
|
"num_tokens": 13729364.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"entropy": 5.881094646453858,
|
|
"epoch": 0.6254988447805083,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004966979247607579,
|
|
"loss": 5.8386,
|
|
"mean_token_accuracy": 0.14335498884320258,
|
|
"num_tokens": 13739436.0,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"entropy": 5.8375767230987545,
|
|
"epoch": 0.6259189245956732,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004966928111046873,
|
|
"loss": 5.7527,
|
|
"mean_token_accuracy": 0.15689793825149537,
|
|
"num_tokens": 13749196.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"entropy": 5.786605453491211,
|
|
"epoch": 0.626339004410838,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004966876935214426,
|
|
"loss": 5.6034,
|
|
"mean_token_accuracy": 0.15611332505941392,
|
|
"num_tokens": 13758414.0,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"entropy": 5.758475589752197,
|
|
"epoch": 0.6267590842260029,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004966825720111147,
|
|
"loss": 5.6428,
|
|
"mean_token_accuracy": 0.1506047248840332,
|
|
"num_tokens": 13767496.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"entropy": 5.810634279251099,
|
|
"epoch": 0.6271791640411678,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004966774465737942,
|
|
"loss": 5.7739,
|
|
"mean_token_accuracy": 0.15019554272294044,
|
|
"num_tokens": 13777033.0,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"entropy": 5.827461242675781,
|
|
"epoch": 0.6275992438563327,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.0004966723172095717,
|
|
"loss": 5.7321,
|
|
"mean_token_accuracy": 0.14653541669249534,
|
|
"num_tokens": 13786313.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"entropy": 5.787878465652466,
|
|
"epoch": 0.6280193236714976,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004966671839185384,
|
|
"loss": 5.6609,
|
|
"mean_token_accuracy": 0.15652250424027442,
|
|
"num_tokens": 13795257.0,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"entropy": 5.715157699584961,
|
|
"epoch": 0.6284394034866625,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004966620467007851,
|
|
"loss": 5.5822,
|
|
"mean_token_accuracy": 0.15817383229732512,
|
|
"num_tokens": 13804582.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"entropy": 5.718271636962891,
|
|
"epoch": 0.6288594833018274,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004966569055564027,
|
|
"loss": 5.5588,
|
|
"mean_token_accuracy": 0.15505106449127198,
|
|
"num_tokens": 13813248.0,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"entropy": 5.8337007522583,
|
|
"epoch": 0.6292795631169922,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004966517604854823,
|
|
"loss": 5.8426,
|
|
"mean_token_accuracy": 0.13890477865934373,
|
|
"num_tokens": 13823301.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"entropy": 5.777707862854004,
|
|
"epoch": 0.6296996429321571,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004966466114881152,
|
|
"loss": 5.5601,
|
|
"mean_token_accuracy": 0.16184355765581132,
|
|
"num_tokens": 13832040.0,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"entropy": 5.751884889602661,
|
|
"epoch": 0.630119722747322,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004966414585643925,
|
|
"loss": 5.7486,
|
|
"mean_token_accuracy": 0.1461435317993164,
|
|
"num_tokens": 13841874.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"entropy": 5.743130207061768,
|
|
"epoch": 0.6305398025624869,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004966363017144055,
|
|
"loss": 5.5699,
|
|
"mean_token_accuracy": 0.1629691332578659,
|
|
"num_tokens": 13850755.0,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"entropy": 5.75084810256958,
|
|
"epoch": 0.6309598823776518,
|
|
"grad_norm": 1.2421875,
|
|
"learning_rate": 0.0004966311409382455,
|
|
"loss": 5.6466,
|
|
"mean_token_accuracy": 0.15385955274105073,
|
|
"num_tokens": 13860009.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"entropy": 5.6666340827941895,
|
|
"epoch": 0.6313799621928167,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004966259762360039,
|
|
"loss": 5.5514,
|
|
"mean_token_accuracy": 0.16524431109428406,
|
|
"num_tokens": 13868476.0,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"entropy": 5.6674237728118895,
|
|
"epoch": 0.6318000420079816,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004966208076077723,
|
|
"loss": 5.5884,
|
|
"mean_token_accuracy": 0.1589062973856926,
|
|
"num_tokens": 13877367.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"entropy": 5.764949607849121,
|
|
"epoch": 0.6322201218231464,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004966156350536422,
|
|
"loss": 5.657,
|
|
"mean_token_accuracy": 0.1521471455693245,
|
|
"num_tokens": 13885985.0,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"entropy": 5.7143798828125,
|
|
"epoch": 0.6326402016383113,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004966104585737054,
|
|
"loss": 5.5793,
|
|
"mean_token_accuracy": 0.1567079693078995,
|
|
"num_tokens": 13895059.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"entropy": 5.7325562000274655,
|
|
"epoch": 0.6330602814534761,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004966052781680534,
|
|
"loss": 5.639,
|
|
"mean_token_accuracy": 0.1523404136300087,
|
|
"num_tokens": 13903789.0,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"entropy": 5.77357177734375,
|
|
"epoch": 0.633480361268641,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004966000938367778,
|
|
"loss": 5.6303,
|
|
"mean_token_accuracy": 0.15177295506000518,
|
|
"num_tokens": 13913377.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"entropy": 5.674224042892456,
|
|
"epoch": 0.6339004410838059,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004965949055799708,
|
|
"loss": 5.5907,
|
|
"mean_token_accuracy": 0.16200509965419768,
|
|
"num_tokens": 13922141.0,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"entropy": 5.762849998474121,
|
|
"epoch": 0.6343205208989708,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004965897133977241,
|
|
"loss": 5.6311,
|
|
"mean_token_accuracy": 0.14147766605019568,
|
|
"num_tokens": 13930717.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"entropy": 5.803768348693848,
|
|
"epoch": 0.6347406007141357,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004965845172901298,
|
|
"loss": 5.7098,
|
|
"mean_token_accuracy": 0.15163939744234084,
|
|
"num_tokens": 13940344.0,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"entropy": 5.696092939376831,
|
|
"epoch": 0.6351606805293005,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004965793172572798,
|
|
"loss": 5.5525,
|
|
"mean_token_accuracy": 0.15789541453123093,
|
|
"num_tokens": 13948400.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"entropy": 5.68128399848938,
|
|
"epoch": 0.6355807603444654,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004965741132992663,
|
|
"loss": 5.6728,
|
|
"mean_token_accuracy": 0.14891874939203262,
|
|
"num_tokens": 13957939.0,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"entropy": 5.8004132270812985,
|
|
"epoch": 0.6360008401596303,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004965689054161814,
|
|
"loss": 5.6336,
|
|
"mean_token_accuracy": 0.15334389358758926,
|
|
"num_tokens": 13966943.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"entropy": 5.748394632339478,
|
|
"epoch": 0.6364209199747952,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004965636936081176,
|
|
"loss": 5.5608,
|
|
"mean_token_accuracy": 0.15567472875118255,
|
|
"num_tokens": 13975850.0,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"entropy": 5.780313920974732,
|
|
"epoch": 0.6368409997899601,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.000496558477875167,
|
|
"loss": 5.6499,
|
|
"mean_token_accuracy": 0.1615470714867115,
|
|
"num_tokens": 13985059.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"entropy": 5.7933934211730955,
|
|
"epoch": 0.637261079605125,
|
|
"grad_norm": 1.3203125,
|
|
"learning_rate": 0.000496553258217422,
|
|
"loss": 5.7004,
|
|
"mean_token_accuracy": 0.1459404468536377,
|
|
"num_tokens": 13993571.0,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"entropy": 5.8100879192352295,
|
|
"epoch": 0.6376811594202898,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004965480346349751,
|
|
"loss": 5.6982,
|
|
"mean_token_accuracy": 0.1491829678416252,
|
|
"num_tokens": 14002326.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"entropy": 5.909585475921631,
|
|
"epoch": 0.6381012392354547,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000496542807127919,
|
|
"loss": 5.8381,
|
|
"mean_token_accuracy": 0.1500192791223526,
|
|
"num_tokens": 14012002.0,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"entropy": 5.814299058914185,
|
|
"epoch": 0.6385213190506196,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000496537575696346,
|
|
"loss": 5.7004,
|
|
"mean_token_accuracy": 0.14627951383590698,
|
|
"num_tokens": 14022085.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"entropy": 5.694448709487915,
|
|
"epoch": 0.6389413988657845,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004965323403403488,
|
|
"loss": 5.5641,
|
|
"mean_token_accuracy": 0.15740595012903214,
|
|
"num_tokens": 14030706.0,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"entropy": 5.74091477394104,
|
|
"epoch": 0.6393614786809494,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004965271010600205,
|
|
"loss": 5.5936,
|
|
"mean_token_accuracy": 0.15839929282665252,
|
|
"num_tokens": 14039520.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"entropy": 5.824947929382324,
|
|
"epoch": 0.6397815584961143,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004965218578554535,
|
|
"loss": 5.7052,
|
|
"mean_token_accuracy": 0.15347498506307602,
|
|
"num_tokens": 14048407.0,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"entropy": 5.705046701431274,
|
|
"epoch": 0.6402016383112792,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.000496516610726741,
|
|
"loss": 5.6076,
|
|
"mean_token_accuracy": 0.15854684114456177,
|
|
"num_tokens": 14057534.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"entropy": 5.763433980941772,
|
|
"epoch": 0.640621718126444,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004965113596739759,
|
|
"loss": 5.5959,
|
|
"mean_token_accuracy": 0.1591039463877678,
|
|
"num_tokens": 14065992.0,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"entropy": 5.7118175506591795,
|
|
"epoch": 0.6410417979416089,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004965061046972508,
|
|
"loss": 5.5862,
|
|
"mean_token_accuracy": 0.1558775633573532,
|
|
"num_tokens": 14074806.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"entropy": 5.717222595214844,
|
|
"epoch": 0.6414618777567738,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004965008457966594,
|
|
"loss": 5.6274,
|
|
"mean_token_accuracy": 0.15323564708232879,
|
|
"num_tokens": 14083813.0,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"entropy": 5.750378274917603,
|
|
"epoch": 0.6418819575719387,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004964955829722945,
|
|
"loss": 5.5653,
|
|
"mean_token_accuracy": 0.15824178904294967,
|
|
"num_tokens": 14092193.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"entropy": 5.85125765800476,
|
|
"epoch": 0.6423020373871036,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004964903162242493,
|
|
"loss": 5.7825,
|
|
"mean_token_accuracy": 0.1446760669350624,
|
|
"num_tokens": 14102797.0,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"entropy": 5.775562286376953,
|
|
"epoch": 0.6427221172022685,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004964850455526173,
|
|
"loss": 5.6487,
|
|
"mean_token_accuracy": 0.1530699238181114,
|
|
"num_tokens": 14112226.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"entropy": 5.671306467056274,
|
|
"epoch": 0.6431421970174334,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004964797709574917,
|
|
"loss": 5.5723,
|
|
"mean_token_accuracy": 0.15474275052547454,
|
|
"num_tokens": 14121775.0,
|
|
"step": 7655
|
|
},
|
|
{
|
|
"entropy": 5.7099405288696286,
|
|
"epoch": 0.6435622768325981,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000496474492438966,
|
|
"loss": 5.5616,
|
|
"mean_token_accuracy": 0.15505539178848265,
|
|
"num_tokens": 14130415.0,
|
|
"step": 7660
|
|
},
|
|
{
|
|
"entropy": 5.719390869140625,
|
|
"epoch": 0.643982356647763,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004964692099971338,
|
|
"loss": 5.5889,
|
|
"mean_token_accuracy": 0.15883549749851228,
|
|
"num_tokens": 14140204.0,
|
|
"step": 7665
|
|
},
|
|
{
|
|
"entropy": 5.727085542678833,
|
|
"epoch": 0.6444024364629279,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004964639236320885,
|
|
"loss": 5.5326,
|
|
"mean_token_accuracy": 0.15387142151594163,
|
|
"num_tokens": 14149595.0,
|
|
"step": 7670
|
|
},
|
|
{
|
|
"entropy": 5.671725845336914,
|
|
"epoch": 0.6448225162780928,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004964586333439239,
|
|
"loss": 5.6096,
|
|
"mean_token_accuracy": 0.15158815383911134,
|
|
"num_tokens": 14158865.0,
|
|
"step": 7675
|
|
},
|
|
{
|
|
"entropy": 5.69843373298645,
|
|
"epoch": 0.6452425960932577,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004964533391327335,
|
|
"loss": 5.5645,
|
|
"mean_token_accuracy": 0.16603838801383972,
|
|
"num_tokens": 14167962.0,
|
|
"step": 7680
|
|
},
|
|
{
|
|
"entropy": 5.727479887008667,
|
|
"epoch": 0.6456626759084226,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004964480409986113,
|
|
"loss": 5.6263,
|
|
"mean_token_accuracy": 0.1631734326481819,
|
|
"num_tokens": 14176479.0,
|
|
"step": 7685
|
|
},
|
|
{
|
|
"entropy": 5.830864524841308,
|
|
"epoch": 0.6460827557235875,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004964427389416512,
|
|
"loss": 5.6441,
|
|
"mean_token_accuracy": 0.15119680762290955,
|
|
"num_tokens": 14185408.0,
|
|
"step": 7690
|
|
},
|
|
{
|
|
"entropy": 5.700068235397339,
|
|
"epoch": 0.6465028355387523,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000496437432961947,
|
|
"loss": 5.6396,
|
|
"mean_token_accuracy": 0.15550505965948105,
|
|
"num_tokens": 14194155.0,
|
|
"step": 7695
|
|
},
|
|
{
|
|
"entropy": 5.704675006866455,
|
|
"epoch": 0.6469229153539172,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004964321230595925,
|
|
"loss": 5.6697,
|
|
"mean_token_accuracy": 0.15572153329849242,
|
|
"num_tokens": 14202779.0,
|
|
"step": 7700
|
|
},
|
|
{
|
|
"entropy": 5.89739408493042,
|
|
"epoch": 0.6473429951690821,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004964268092346821,
|
|
"loss": 5.856,
|
|
"mean_token_accuracy": 0.1441698580980301,
|
|
"num_tokens": 14212552.0,
|
|
"step": 7705
|
|
},
|
|
{
|
|
"entropy": 5.918906402587891,
|
|
"epoch": 0.647763074984247,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004964214914873098,
|
|
"loss": 5.6441,
|
|
"mean_token_accuracy": 0.14508990049362183,
|
|
"num_tokens": 14222783.0,
|
|
"step": 7710
|
|
},
|
|
{
|
|
"entropy": 5.6578751564025875,
|
|
"epoch": 0.6481831547994119,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004964161698175697,
|
|
"loss": 5.5291,
|
|
"mean_token_accuracy": 0.15217974185943603,
|
|
"num_tokens": 14232085.0,
|
|
"step": 7715
|
|
},
|
|
{
|
|
"entropy": 5.666456794738769,
|
|
"epoch": 0.6486032346145768,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004964108442255562,
|
|
"loss": 5.6923,
|
|
"mean_token_accuracy": 0.14667234867811202,
|
|
"num_tokens": 14241969.0,
|
|
"step": 7720
|
|
},
|
|
{
|
|
"entropy": 5.712312459945679,
|
|
"epoch": 0.6490233144297417,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004964055147113637,
|
|
"loss": 5.6018,
|
|
"mean_token_accuracy": 0.15744369924068452,
|
|
"num_tokens": 14251012.0,
|
|
"step": 7725
|
|
},
|
|
{
|
|
"entropy": 5.881955242156982,
|
|
"epoch": 0.6494433942449065,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004964001812750864,
|
|
"loss": 5.7226,
|
|
"mean_token_accuracy": 0.14762117117643356,
|
|
"num_tokens": 14261110.0,
|
|
"step": 7730
|
|
},
|
|
{
|
|
"entropy": 5.768258047103882,
|
|
"epoch": 0.6498634740600714,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000496394843916819,
|
|
"loss": 5.7015,
|
|
"mean_token_accuracy": 0.14753929674625396,
|
|
"num_tokens": 14270869.0,
|
|
"step": 7735
|
|
},
|
|
{
|
|
"entropy": 5.736590814590454,
|
|
"epoch": 0.6502835538752363,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004963895026366558,
|
|
"loss": 5.6374,
|
|
"mean_token_accuracy": 0.14768418967723845,
|
|
"num_tokens": 14279607.0,
|
|
"step": 7740
|
|
},
|
|
{
|
|
"entropy": 5.7367908477783205,
|
|
"epoch": 0.6507036336904012,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004963841574346917,
|
|
"loss": 5.6264,
|
|
"mean_token_accuracy": 0.14881719276309013,
|
|
"num_tokens": 14289282.0,
|
|
"step": 7745
|
|
},
|
|
{
|
|
"entropy": 5.697269630432129,
|
|
"epoch": 0.6511237135055661,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004963788083110212,
|
|
"loss": 5.5701,
|
|
"mean_token_accuracy": 0.15828859508037568,
|
|
"num_tokens": 14298658.0,
|
|
"step": 7750
|
|
},
|
|
{
|
|
"entropy": 5.868004083633423,
|
|
"epoch": 0.651543793320731,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.000496373455265739,
|
|
"loss": 5.6507,
|
|
"mean_token_accuracy": 0.1502188928425312,
|
|
"num_tokens": 14307832.0,
|
|
"step": 7755
|
|
},
|
|
{
|
|
"entropy": 5.7541135311126705,
|
|
"epoch": 0.6519638731358958,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004963680982989402,
|
|
"loss": 5.5594,
|
|
"mean_token_accuracy": 0.15743454396724701,
|
|
"num_tokens": 14317122.0,
|
|
"step": 7760
|
|
},
|
|
{
|
|
"entropy": 5.668973541259765,
|
|
"epoch": 0.6523839529510607,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004963627374107195,
|
|
"loss": 5.6035,
|
|
"mean_token_accuracy": 0.15760391652584077,
|
|
"num_tokens": 14326069.0,
|
|
"step": 7765
|
|
},
|
|
{
|
|
"entropy": 5.701200485229492,
|
|
"epoch": 0.6528040327662256,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004963573726011717,
|
|
"loss": 5.604,
|
|
"mean_token_accuracy": 0.1542496606707573,
|
|
"num_tokens": 14335260.0,
|
|
"step": 7770
|
|
},
|
|
{
|
|
"entropy": 5.85819001197815,
|
|
"epoch": 0.6532241125813905,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004963520038703922,
|
|
"loss": 5.6911,
|
|
"mean_token_accuracy": 0.14053659662604331,
|
|
"num_tokens": 14345823.0,
|
|
"step": 7775
|
|
},
|
|
{
|
|
"entropy": 5.743032455444336,
|
|
"epoch": 0.6536441923965554,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.000496346631218476,
|
|
"loss": 5.5588,
|
|
"mean_token_accuracy": 0.15523895770311355,
|
|
"num_tokens": 14354316.0,
|
|
"step": 7780
|
|
},
|
|
{
|
|
"entropy": 5.75605149269104,
|
|
"epoch": 0.6540642722117203,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.000496341254645518,
|
|
"loss": 5.6096,
|
|
"mean_token_accuracy": 0.1583119735121727,
|
|
"num_tokens": 14364539.0,
|
|
"step": 7785
|
|
},
|
|
{
|
|
"entropy": 5.746860456466675,
|
|
"epoch": 0.6544843520268852,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004963358741516138,
|
|
"loss": 5.728,
|
|
"mean_token_accuracy": 0.14681073948740958,
|
|
"num_tokens": 14374081.0,
|
|
"step": 7790
|
|
},
|
|
{
|
|
"entropy": 5.743828630447387,
|
|
"epoch": 0.6549044318420499,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004963304897368585,
|
|
"loss": 5.6157,
|
|
"mean_token_accuracy": 0.15063438564538956,
|
|
"num_tokens": 14383255.0,
|
|
"step": 7795
|
|
},
|
|
{
|
|
"entropy": 5.8674695014953615,
|
|
"epoch": 0.6553245116572148,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004963251014013475,
|
|
"loss": 5.7454,
|
|
"mean_token_accuracy": 0.1499713510274887,
|
|
"num_tokens": 14392417.0,
|
|
"step": 7800
|
|
},
|
|
{
|
|
"entropy": 5.904078197479248,
|
|
"epoch": 0.6557445914723797,
|
|
"grad_norm": 3.15625,
|
|
"learning_rate": 0.0004963197091451763,
|
|
"loss": 5.7859,
|
|
"mean_token_accuracy": 0.13848365619778633,
|
|
"num_tokens": 14401899.0,
|
|
"step": 7805
|
|
},
|
|
{
|
|
"entropy": 5.860718584060669,
|
|
"epoch": 0.6561646712875446,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004963143129684405,
|
|
"loss": 5.7538,
|
|
"mean_token_accuracy": 0.14672411531209945,
|
|
"num_tokens": 14411245.0,
|
|
"step": 7810
|
|
},
|
|
{
|
|
"entropy": 5.703166818618774,
|
|
"epoch": 0.6565847511027095,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004963089128712355,
|
|
"loss": 5.6064,
|
|
"mean_token_accuracy": 0.15734720826148987,
|
|
"num_tokens": 14419710.0,
|
|
"step": 7815
|
|
},
|
|
{
|
|
"entropy": 5.71895399093628,
|
|
"epoch": 0.6570048309178744,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004963035088536571,
|
|
"loss": 5.5915,
|
|
"mean_token_accuracy": 0.15892575681209564,
|
|
"num_tokens": 14430266.0,
|
|
"step": 7820
|
|
},
|
|
{
|
|
"entropy": 5.801663017272949,
|
|
"epoch": 0.6574249107330393,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004962981009158012,
|
|
"loss": 5.5594,
|
|
"mean_token_accuracy": 0.14950107038021088,
|
|
"num_tokens": 14439515.0,
|
|
"step": 7825
|
|
},
|
|
{
|
|
"entropy": 5.776130819320679,
|
|
"epoch": 0.6578449905482041,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004962926890577632,
|
|
"loss": 5.625,
|
|
"mean_token_accuracy": 0.15389978587627412,
|
|
"num_tokens": 14448091.0,
|
|
"step": 7830
|
|
},
|
|
{
|
|
"entropy": 5.71961612701416,
|
|
"epoch": 0.658265070363369,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 0.000496287273279639,
|
|
"loss": 5.6595,
|
|
"mean_token_accuracy": 0.14972689151763915,
|
|
"num_tokens": 14457744.0,
|
|
"step": 7835
|
|
},
|
|
{
|
|
"entropy": 5.747187852859497,
|
|
"epoch": 0.6586851501785339,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000496281853581525,
|
|
"loss": 5.6368,
|
|
"mean_token_accuracy": 0.15765562653541565,
|
|
"num_tokens": 14467597.0,
|
|
"step": 7840
|
|
},
|
|
{
|
|
"entropy": 5.736932706832886,
|
|
"epoch": 0.6591052299936988,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004962764299635168,
|
|
"loss": 5.6237,
|
|
"mean_token_accuracy": 0.15176722705364226,
|
|
"num_tokens": 14476662.0,
|
|
"step": 7845
|
|
},
|
|
{
|
|
"entropy": 5.84397029876709,
|
|
"epoch": 0.6595253098088637,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0004962710024257105,
|
|
"loss": 5.7182,
|
|
"mean_token_accuracy": 0.1437528148293495,
|
|
"num_tokens": 14486583.0,
|
|
"step": 7850
|
|
},
|
|
{
|
|
"entropy": 5.882024717330933,
|
|
"epoch": 0.6599453896240286,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004962655709682025,
|
|
"loss": 5.7135,
|
|
"mean_token_accuracy": 0.14967520385980607,
|
|
"num_tokens": 14496528.0,
|
|
"step": 7855
|
|
},
|
|
{
|
|
"entropy": 5.7820171356201175,
|
|
"epoch": 0.6603654694391935,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004962601355910887,
|
|
"loss": 5.6997,
|
|
"mean_token_accuracy": 0.14425860121846198,
|
|
"num_tokens": 14507026.0,
|
|
"step": 7860
|
|
},
|
|
{
|
|
"entropy": 5.658552885055542,
|
|
"epoch": 0.6607855492543583,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004962546962944656,
|
|
"loss": 5.5687,
|
|
"mean_token_accuracy": 0.1535755679011345,
|
|
"num_tokens": 14516480.0,
|
|
"step": 7865
|
|
},
|
|
{
|
|
"entropy": 5.741037130355835,
|
|
"epoch": 0.6612056290695232,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004962492530784295,
|
|
"loss": 5.5149,
|
|
"mean_token_accuracy": 0.16426774710416794,
|
|
"num_tokens": 14525068.0,
|
|
"step": 7870
|
|
},
|
|
{
|
|
"entropy": 5.7695718765258786,
|
|
"epoch": 0.6616257088846881,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004962438059430768,
|
|
"loss": 5.6342,
|
|
"mean_token_accuracy": 0.15432853996753693,
|
|
"num_tokens": 14534441.0,
|
|
"step": 7875
|
|
},
|
|
{
|
|
"entropy": 5.770703077316284,
|
|
"epoch": 0.662045788699853,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004962383548885039,
|
|
"loss": 5.7265,
|
|
"mean_token_accuracy": 0.15264071598649026,
|
|
"num_tokens": 14543026.0,
|
|
"step": 7880
|
|
},
|
|
{
|
|
"entropy": 5.728618335723877,
|
|
"epoch": 0.6624658685150179,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004962328999148075,
|
|
"loss": 5.5953,
|
|
"mean_token_accuracy": 0.1570552945137024,
|
|
"num_tokens": 14552068.0,
|
|
"step": 7885
|
|
},
|
|
{
|
|
"entropy": 5.7635719776153564,
|
|
"epoch": 0.6628859483301828,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004962274410220842,
|
|
"loss": 5.7341,
|
|
"mean_token_accuracy": 0.14789471104741098,
|
|
"num_tokens": 14561587.0,
|
|
"step": 7890
|
|
},
|
|
{
|
|
"entropy": 5.859791803359985,
|
|
"epoch": 0.6633060281453477,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004962219782104308,
|
|
"loss": 5.7256,
|
|
"mean_token_accuracy": 0.16066627949476242,
|
|
"num_tokens": 14571020.0,
|
|
"step": 7895
|
|
},
|
|
{
|
|
"entropy": 5.814104175567627,
|
|
"epoch": 0.6637261079605125,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004962165114799439,
|
|
"loss": 5.6841,
|
|
"mean_token_accuracy": 0.1433543421328068,
|
|
"num_tokens": 14580638.0,
|
|
"step": 7900
|
|
},
|
|
{
|
|
"entropy": 5.693702554702758,
|
|
"epoch": 0.6641461877756774,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004962110408307204,
|
|
"loss": 5.6214,
|
|
"mean_token_accuracy": 0.15168005228042603,
|
|
"num_tokens": 14590173.0,
|
|
"step": 7905
|
|
},
|
|
{
|
|
"entropy": 5.696859216690063,
|
|
"epoch": 0.6645662675908423,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004962055662628571,
|
|
"loss": 5.5978,
|
|
"mean_token_accuracy": 0.15115945935249328,
|
|
"num_tokens": 14598635.0,
|
|
"step": 7910
|
|
},
|
|
{
|
|
"entropy": 5.789608335494995,
|
|
"epoch": 0.6649863474060071,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004962000877764513,
|
|
"loss": 5.6362,
|
|
"mean_token_accuracy": 0.15672800540924073,
|
|
"num_tokens": 14607233.0,
|
|
"step": 7915
|
|
},
|
|
{
|
|
"entropy": 5.904106950759887,
|
|
"epoch": 0.665406427221172,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004961946053715998,
|
|
"loss": 5.8042,
|
|
"mean_token_accuracy": 0.14238024279475212,
|
|
"num_tokens": 14617483.0,
|
|
"step": 7920
|
|
},
|
|
{
|
|
"entropy": 5.73907699584961,
|
|
"epoch": 0.665826507036337,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004961891190483997,
|
|
"loss": 5.6099,
|
|
"mean_token_accuracy": 0.15576208233833314,
|
|
"num_tokens": 14625805.0,
|
|
"step": 7925
|
|
},
|
|
{
|
|
"entropy": 5.668482398986816,
|
|
"epoch": 0.6662465868515017,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004961836288069483,
|
|
"loss": 5.5303,
|
|
"mean_token_accuracy": 0.15367640256881715,
|
|
"num_tokens": 14634605.0,
|
|
"step": 7930
|
|
},
|
|
{
|
|
"entropy": 5.802455711364746,
|
|
"epoch": 0.6666666666666666,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004961781346473428,
|
|
"loss": 5.7424,
|
|
"mean_token_accuracy": 0.1421908602118492,
|
|
"num_tokens": 14644970.0,
|
|
"step": 7935
|
|
},
|
|
{
|
|
"entropy": 5.835074090957642,
|
|
"epoch": 0.6670867464818315,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004961726365696805,
|
|
"loss": 5.6277,
|
|
"mean_token_accuracy": 0.15110262408852576,
|
|
"num_tokens": 14655043.0,
|
|
"step": 7940
|
|
},
|
|
{
|
|
"entropy": 5.85212664604187,
|
|
"epoch": 0.6675068262969964,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004961671345740589,
|
|
"loss": 5.6034,
|
|
"mean_token_accuracy": 0.15058621391654015,
|
|
"num_tokens": 14663994.0,
|
|
"step": 7945
|
|
},
|
|
{
|
|
"entropy": 5.6571033000946045,
|
|
"epoch": 0.6679269061121613,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004961616286605753,
|
|
"loss": 5.6122,
|
|
"mean_token_accuracy": 0.1486305832862854,
|
|
"num_tokens": 14674101.0,
|
|
"step": 7950
|
|
},
|
|
{
|
|
"entropy": 5.6964271068573,
|
|
"epoch": 0.6683469859273262,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004961561188293273,
|
|
"loss": 5.6979,
|
|
"mean_token_accuracy": 0.14713332504034043,
|
|
"num_tokens": 14684156.0,
|
|
"step": 7955
|
|
},
|
|
{
|
|
"entropy": 5.670635604858399,
|
|
"epoch": 0.6687670657424911,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004961506050804126,
|
|
"loss": 5.6027,
|
|
"mean_token_accuracy": 0.15685120671987535,
|
|
"num_tokens": 14693223.0,
|
|
"step": 7960
|
|
},
|
|
{
|
|
"entropy": 5.775451517105102,
|
|
"epoch": 0.6691871455576559,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.000496145087413929,
|
|
"loss": 5.604,
|
|
"mean_token_accuracy": 0.15279402881860732,
|
|
"num_tokens": 14702959.0,
|
|
"step": 7965
|
|
},
|
|
{
|
|
"entropy": 5.8267169952392575,
|
|
"epoch": 0.6696072253728208,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004961395658299737,
|
|
"loss": 5.7168,
|
|
"mean_token_accuracy": 0.14913576021790503,
|
|
"num_tokens": 14712146.0,
|
|
"step": 7970
|
|
},
|
|
{
|
|
"entropy": 5.767063426971435,
|
|
"epoch": 0.6700273051879857,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004961340403286451,
|
|
"loss": 5.6427,
|
|
"mean_token_accuracy": 0.1493378534913063,
|
|
"num_tokens": 14721932.0,
|
|
"step": 7975
|
|
},
|
|
{
|
|
"entropy": 5.712933492660523,
|
|
"epoch": 0.6704473850031506,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004961285109100408,
|
|
"loss": 5.5679,
|
|
"mean_token_accuracy": 0.16175169199705125,
|
|
"num_tokens": 14731080.0,
|
|
"step": 7980
|
|
},
|
|
{
|
|
"entropy": 5.712076044082641,
|
|
"epoch": 0.6708674648183155,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004961229775742587,
|
|
"loss": 5.5804,
|
|
"mean_token_accuracy": 0.16035438925027848,
|
|
"num_tokens": 14740057.0,
|
|
"step": 7985
|
|
},
|
|
{
|
|
"entropy": 5.814006376266479,
|
|
"epoch": 0.6712875446334804,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000496117440321397,
|
|
"loss": 5.6576,
|
|
"mean_token_accuracy": 0.15759425461292267,
|
|
"num_tokens": 14748399.0,
|
|
"step": 7990
|
|
},
|
|
{
|
|
"entropy": 5.742432641983032,
|
|
"epoch": 0.6717076244486453,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004961118991515537,
|
|
"loss": 5.6627,
|
|
"mean_token_accuracy": 0.1502150148153305,
|
|
"num_tokens": 14757215.0,
|
|
"step": 7995
|
|
},
|
|
{
|
|
"entropy": 5.684312534332276,
|
|
"epoch": 0.6721277042638101,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000496106354064827,
|
|
"loss": 5.6795,
|
|
"mean_token_accuracy": 0.15305250138044357,
|
|
"num_tokens": 14766191.0,
|
|
"step": 8000
|
|
},
|
|
{
|
|
"entropy": 5.875341749191284,
|
|
"epoch": 0.672547784078975,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004961008050613149,
|
|
"loss": 5.7247,
|
|
"mean_token_accuracy": 0.1427694909274578,
|
|
"num_tokens": 14775220.0,
|
|
"step": 8005
|
|
},
|
|
{
|
|
"entropy": 5.894955635070801,
|
|
"epoch": 0.6729678638941399,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004960952521411161,
|
|
"loss": 5.6944,
|
|
"mean_token_accuracy": 0.14855958074331282,
|
|
"num_tokens": 14784287.0,
|
|
"step": 8010
|
|
},
|
|
{
|
|
"entropy": 5.868842029571534,
|
|
"epoch": 0.6733879437093048,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004960896953043287,
|
|
"loss": 5.7613,
|
|
"mean_token_accuracy": 0.14570847973227502,
|
|
"num_tokens": 14794219.0,
|
|
"step": 8015
|
|
},
|
|
{
|
|
"entropy": 5.783145427703857,
|
|
"epoch": 0.6738080235244697,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004960841345510511,
|
|
"loss": 5.6466,
|
|
"mean_token_accuracy": 0.1534015029668808,
|
|
"num_tokens": 14803324.0,
|
|
"step": 8020
|
|
},
|
|
{
|
|
"entropy": 5.758127498626709,
|
|
"epoch": 0.6742281033396346,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000496078569881382,
|
|
"loss": 5.669,
|
|
"mean_token_accuracy": 0.15486637651920318,
|
|
"num_tokens": 14811963.0,
|
|
"step": 8025
|
|
},
|
|
{
|
|
"entropy": 5.706368112564087,
|
|
"epoch": 0.6746481831547995,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004960730012954198,
|
|
"loss": 5.6159,
|
|
"mean_token_accuracy": 0.15371300429105758,
|
|
"num_tokens": 14821903.0,
|
|
"step": 8030
|
|
},
|
|
{
|
|
"entropy": 5.682500696182251,
|
|
"epoch": 0.6750682629699643,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004960674287932634,
|
|
"loss": 5.6036,
|
|
"mean_token_accuracy": 0.15207039266824723,
|
|
"num_tokens": 14831215.0,
|
|
"step": 8035
|
|
},
|
|
{
|
|
"entropy": 5.7689878940582275,
|
|
"epoch": 0.6754883427851291,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004960618523750111,
|
|
"loss": 5.5344,
|
|
"mean_token_accuracy": 0.16025708019733428,
|
|
"num_tokens": 14840354.0,
|
|
"step": 8040
|
|
},
|
|
{
|
|
"entropy": 5.807526636123657,
|
|
"epoch": 0.675908422600294,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.000496056272040762,
|
|
"loss": 5.7256,
|
|
"mean_token_accuracy": 0.1501130685210228,
|
|
"num_tokens": 14849660.0,
|
|
"step": 8045
|
|
},
|
|
{
|
|
"entropy": 5.7956421852111815,
|
|
"epoch": 0.6763285024154589,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004960506877906149,
|
|
"loss": 5.6257,
|
|
"mean_token_accuracy": 0.14838283210992814,
|
|
"num_tokens": 14859819.0,
|
|
"step": 8050
|
|
},
|
|
{
|
|
"entropy": 5.757112693786621,
|
|
"epoch": 0.6767485822306238,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004960450996246686,
|
|
"loss": 5.6388,
|
|
"mean_token_accuracy": 0.15627321898937224,
|
|
"num_tokens": 14869260.0,
|
|
"step": 8055
|
|
},
|
|
{
|
|
"entropy": 5.7164520740509035,
|
|
"epoch": 0.6771686620457887,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004960395075430222,
|
|
"loss": 5.614,
|
|
"mean_token_accuracy": 0.15564015433192252,
|
|
"num_tokens": 14878685.0,
|
|
"step": 8060
|
|
},
|
|
{
|
|
"entropy": 5.777329826354981,
|
|
"epoch": 0.6775887418609536,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004960339115457748,
|
|
"loss": 5.6172,
|
|
"mean_token_accuracy": 0.1507098227739334,
|
|
"num_tokens": 14888456.0,
|
|
"step": 8065
|
|
},
|
|
{
|
|
"entropy": 5.80708589553833,
|
|
"epoch": 0.6780088216761184,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004960283116330255,
|
|
"loss": 5.7205,
|
|
"mean_token_accuracy": 0.14664217829704285,
|
|
"num_tokens": 14897401.0,
|
|
"step": 8070
|
|
},
|
|
{
|
|
"entropy": 5.84255633354187,
|
|
"epoch": 0.6784289014912833,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004960227078048735,
|
|
"loss": 5.6226,
|
|
"mean_token_accuracy": 0.15697504729032516,
|
|
"num_tokens": 14906741.0,
|
|
"step": 8075
|
|
},
|
|
{
|
|
"entropy": 5.720810079574585,
|
|
"epoch": 0.6788489813064482,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004960171000614179,
|
|
"loss": 5.5069,
|
|
"mean_token_accuracy": 0.16248478889465331,
|
|
"num_tokens": 14916002.0,
|
|
"step": 8080
|
|
},
|
|
{
|
|
"entropy": 5.585197591781617,
|
|
"epoch": 0.6792690611216131,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004960114884027583,
|
|
"loss": 5.4659,
|
|
"mean_token_accuracy": 0.1668694093823433,
|
|
"num_tokens": 14925247.0,
|
|
"step": 8085
|
|
},
|
|
{
|
|
"entropy": 5.697068214416504,
|
|
"epoch": 0.679689140936778,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004960058728289939,
|
|
"loss": 5.5785,
|
|
"mean_token_accuracy": 0.1530070647597313,
|
|
"num_tokens": 14933925.0,
|
|
"step": 8090
|
|
},
|
|
{
|
|
"entropy": 5.845986604690552,
|
|
"epoch": 0.6801092207519429,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004960002533402243,
|
|
"loss": 5.6551,
|
|
"mean_token_accuracy": 0.1536601111292839,
|
|
"num_tokens": 14943368.0,
|
|
"step": 8095
|
|
},
|
|
{
|
|
"entropy": 5.7944434642791744,
|
|
"epoch": 0.6805293005671077,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004959946299365491,
|
|
"loss": 5.6835,
|
|
"mean_token_accuracy": 0.147177092730999,
|
|
"num_tokens": 14953710.0,
|
|
"step": 8100
|
|
},
|
|
{
|
|
"entropy": 5.828194952011108,
|
|
"epoch": 0.6809493803822726,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004959890026180677,
|
|
"loss": 5.691,
|
|
"mean_token_accuracy": 0.1521795153617859,
|
|
"num_tokens": 14962814.0,
|
|
"step": 8105
|
|
},
|
|
{
|
|
"entropy": 5.653000354766846,
|
|
"epoch": 0.6813694601974375,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00049598337138488,
|
|
"loss": 5.5758,
|
|
"mean_token_accuracy": 0.15957341492176055,
|
|
"num_tokens": 14971631.0,
|
|
"step": 8110
|
|
},
|
|
{
|
|
"entropy": 5.725569152832032,
|
|
"epoch": 0.6817895400126024,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004959777362370855,
|
|
"loss": 5.5609,
|
|
"mean_token_accuracy": 0.1567662850022316,
|
|
"num_tokens": 14980528.0,
|
|
"step": 8115
|
|
},
|
|
{
|
|
"entropy": 5.756756782531738,
|
|
"epoch": 0.6822096198277673,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004959720971747843,
|
|
"loss": 5.5812,
|
|
"mean_token_accuracy": 0.15582162886857986,
|
|
"num_tokens": 14989331.0,
|
|
"step": 8120
|
|
},
|
|
{
|
|
"entropy": 5.713353538513184,
|
|
"epoch": 0.6826296996429322,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004959664541980762,
|
|
"loss": 5.5831,
|
|
"mean_token_accuracy": 0.15746317207813262,
|
|
"num_tokens": 14999403.0,
|
|
"step": 8125
|
|
},
|
|
{
|
|
"entropy": 5.753393220901489,
|
|
"epoch": 0.6830497794580971,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004959608073070612,
|
|
"loss": 5.6785,
|
|
"mean_token_accuracy": 0.14850740507245064,
|
|
"num_tokens": 15009388.0,
|
|
"step": 8130
|
|
},
|
|
{
|
|
"entropy": 5.798916053771973,
|
|
"epoch": 0.6834698592732619,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004959551565018392,
|
|
"loss": 5.5994,
|
|
"mean_token_accuracy": 0.1609173148870468,
|
|
"num_tokens": 15018586.0,
|
|
"step": 8135
|
|
},
|
|
{
|
|
"entropy": 5.737577104568482,
|
|
"epoch": 0.6838899390884268,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004959495017825104,
|
|
"loss": 5.611,
|
|
"mean_token_accuracy": 0.1578981950879097,
|
|
"num_tokens": 15027982.0,
|
|
"step": 8140
|
|
},
|
|
{
|
|
"entropy": 5.684477043151856,
|
|
"epoch": 0.6843100189035917,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004959438431491749,
|
|
"loss": 5.6098,
|
|
"mean_token_accuracy": 0.1577099531888962,
|
|
"num_tokens": 15037103.0,
|
|
"step": 8145
|
|
},
|
|
{
|
|
"entropy": 5.697509860992431,
|
|
"epoch": 0.6847300987187566,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.000495938180601933,
|
|
"loss": 5.7013,
|
|
"mean_token_accuracy": 0.1484990067780018,
|
|
"num_tokens": 15046739.0,
|
|
"step": 8150
|
|
},
|
|
{
|
|
"entropy": 5.794940948486328,
|
|
"epoch": 0.6851501785339215,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004959325141408851,
|
|
"loss": 5.6373,
|
|
"mean_token_accuracy": 0.15557540208101273,
|
|
"num_tokens": 15056586.0,
|
|
"step": 8155
|
|
},
|
|
{
|
|
"entropy": 5.724914121627807,
|
|
"epoch": 0.6855702583490864,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004959268437661313,
|
|
"loss": 5.6297,
|
|
"mean_token_accuracy": 0.15379536151885986,
|
|
"num_tokens": 15066622.0,
|
|
"step": 8160
|
|
},
|
|
{
|
|
"entropy": 5.794011211395263,
|
|
"epoch": 0.6859903381642513,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004959211694777724,
|
|
"loss": 5.6106,
|
|
"mean_token_accuracy": 0.15808048397302626,
|
|
"num_tokens": 15075415.0,
|
|
"step": 8165
|
|
},
|
|
{
|
|
"entropy": 5.751935195922852,
|
|
"epoch": 0.686410417979416,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004959154912759086,
|
|
"loss": 5.59,
|
|
"mean_token_accuracy": 0.15190571993589402,
|
|
"num_tokens": 15085087.0,
|
|
"step": 8170
|
|
},
|
|
{
|
|
"entropy": 5.736638736724854,
|
|
"epoch": 0.6868304977945809,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004959098091606406,
|
|
"loss": 5.6039,
|
|
"mean_token_accuracy": 0.15554135516285897,
|
|
"num_tokens": 15093580.0,
|
|
"step": 8175
|
|
},
|
|
{
|
|
"entropy": 5.630563879013062,
|
|
"epoch": 0.6872505776097458,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004959041231320692,
|
|
"loss": 5.5887,
|
|
"mean_token_accuracy": 0.16016243845224382,
|
|
"num_tokens": 15104033.0,
|
|
"step": 8180
|
|
},
|
|
{
|
|
"entropy": 5.761919355392456,
|
|
"epoch": 0.6876706574249107,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004958984331902951,
|
|
"loss": 5.6592,
|
|
"mean_token_accuracy": 0.14851288944482804,
|
|
"num_tokens": 15113164.0,
|
|
"step": 8185
|
|
},
|
|
{
|
|
"entropy": 5.72430567741394,
|
|
"epoch": 0.6880907372400756,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004958927393354188,
|
|
"loss": 5.6022,
|
|
"mean_token_accuracy": 0.15889295786619187,
|
|
"num_tokens": 15122215.0,
|
|
"step": 8190
|
|
},
|
|
{
|
|
"entropy": 5.736812925338745,
|
|
"epoch": 0.6885108170552405,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004958870415675415,
|
|
"loss": 5.5838,
|
|
"mean_token_accuracy": 0.15310561507940293,
|
|
"num_tokens": 15130877.0,
|
|
"step": 8195
|
|
},
|
|
{
|
|
"entropy": 5.734648513793945,
|
|
"epoch": 0.6889308968704054,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004958813398867639,
|
|
"loss": 5.5748,
|
|
"mean_token_accuracy": 0.1602252170443535,
|
|
"num_tokens": 15140227.0,
|
|
"step": 8200
|
|
},
|
|
{
|
|
"entropy": 5.8443381786346436,
|
|
"epoch": 0.6893509766855702,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004958756342931872,
|
|
"loss": 5.7447,
|
|
"mean_token_accuracy": 0.1436396934092045,
|
|
"num_tokens": 15150006.0,
|
|
"step": 8205
|
|
},
|
|
{
|
|
"entropy": 5.749261903762817,
|
|
"epoch": 0.6897710565007351,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004958699247869122,
|
|
"loss": 5.6456,
|
|
"mean_token_accuracy": 0.15174706876277924,
|
|
"num_tokens": 15160032.0,
|
|
"step": 8210
|
|
},
|
|
{
|
|
"entropy": 5.75321626663208,
|
|
"epoch": 0.6901911363159,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004958642113680404,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.16214127987623214,
|
|
"num_tokens": 15168966.0,
|
|
"step": 8215
|
|
},
|
|
{
|
|
"entropy": 5.870010280609131,
|
|
"epoch": 0.6906112161310649,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004958584940366727,
|
|
"loss": 5.7572,
|
|
"mean_token_accuracy": 0.14761550426483155,
|
|
"num_tokens": 15179337.0,
|
|
"step": 8220
|
|
},
|
|
{
|
|
"entropy": 5.803754615783691,
|
|
"epoch": 0.6910312959462298,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004958527727929106,
|
|
"loss": 5.6701,
|
|
"mean_token_accuracy": 0.1573884293437004,
|
|
"num_tokens": 15188395.0,
|
|
"step": 8225
|
|
},
|
|
{
|
|
"entropy": 5.746722412109375,
|
|
"epoch": 0.6914513757613947,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004958470476368552,
|
|
"loss": 5.6025,
|
|
"mean_token_accuracy": 0.16065260544419288,
|
|
"num_tokens": 15198669.0,
|
|
"step": 8230
|
|
},
|
|
{
|
|
"entropy": 5.7148209571838375,
|
|
"epoch": 0.6918714555765595,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004958413185686082,
|
|
"loss": 5.6004,
|
|
"mean_token_accuracy": 0.15373986065387726,
|
|
"num_tokens": 15207371.0,
|
|
"step": 8235
|
|
},
|
|
{
|
|
"entropy": 5.766255283355713,
|
|
"epoch": 0.6922915353917244,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004958355855882709,
|
|
"loss": 5.6358,
|
|
"mean_token_accuracy": 0.1554594576358795,
|
|
"num_tokens": 15215694.0,
|
|
"step": 8240
|
|
},
|
|
{
|
|
"entropy": 5.766929721832275,
|
|
"epoch": 0.6927116152068893,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000495829848695945,
|
|
"loss": 5.6244,
|
|
"mean_token_accuracy": 0.15312992185354232,
|
|
"num_tokens": 15224963.0,
|
|
"step": 8245
|
|
},
|
|
{
|
|
"entropy": 5.648315715789795,
|
|
"epoch": 0.6931316950220542,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.000495824107891732,
|
|
"loss": 5.43,
|
|
"mean_token_accuracy": 0.1609826758503914,
|
|
"num_tokens": 15233569.0,
|
|
"step": 8250
|
|
},
|
|
{
|
|
"entropy": 5.702505493164063,
|
|
"epoch": 0.6935517748372191,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004958183631757336,
|
|
"loss": 5.6174,
|
|
"mean_token_accuracy": 0.1533745378255844,
|
|
"num_tokens": 15242671.0,
|
|
"step": 8255
|
|
},
|
|
{
|
|
"entropy": 5.715704250335693,
|
|
"epoch": 0.693971854652384,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004958126145480517,
|
|
"loss": 5.5655,
|
|
"mean_token_accuracy": 0.1603790357708931,
|
|
"num_tokens": 15251698.0,
|
|
"step": 8260
|
|
},
|
|
{
|
|
"entropy": 5.793996429443359,
|
|
"epoch": 0.6943919344675489,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004958068620087879,
|
|
"loss": 5.7033,
|
|
"mean_token_accuracy": 0.15312060862779617,
|
|
"num_tokens": 15260608.0,
|
|
"step": 8265
|
|
},
|
|
{
|
|
"entropy": 5.725742197036743,
|
|
"epoch": 0.6948120142827137,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004958011055580443,
|
|
"loss": 5.5631,
|
|
"mean_token_accuracy": 0.15500877648591996,
|
|
"num_tokens": 15268866.0,
|
|
"step": 8270
|
|
},
|
|
{
|
|
"entropy": 5.648518466949463,
|
|
"epoch": 0.6952320940978786,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004957953451959229,
|
|
"loss": 5.5272,
|
|
"mean_token_accuracy": 0.16275469362735748,
|
|
"num_tokens": 15277600.0,
|
|
"step": 8275
|
|
},
|
|
{
|
|
"entropy": 5.730481147766113,
|
|
"epoch": 0.6956521739130435,
|
|
"grad_norm": 1.296875,
|
|
"learning_rate": 0.0004957895809225254,
|
|
"loss": 5.5546,
|
|
"mean_token_accuracy": 0.15924442559480667,
|
|
"num_tokens": 15286016.0,
|
|
"step": 8280
|
|
},
|
|
{
|
|
"entropy": 5.750138473510742,
|
|
"epoch": 0.6960722537282084,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004957838127379544,
|
|
"loss": 5.6063,
|
|
"mean_token_accuracy": 0.1584298923611641,
|
|
"num_tokens": 15294676.0,
|
|
"step": 8285
|
|
},
|
|
{
|
|
"entropy": 5.749259424209595,
|
|
"epoch": 0.6964923335433733,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004957780406423118,
|
|
"loss": 5.5886,
|
|
"mean_token_accuracy": 0.1535394623875618,
|
|
"num_tokens": 15304084.0,
|
|
"step": 8290
|
|
},
|
|
{
|
|
"entropy": 5.743555212020874,
|
|
"epoch": 0.6969124133585382,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004957722646356999,
|
|
"loss": 5.5989,
|
|
"mean_token_accuracy": 0.15085933804512025,
|
|
"num_tokens": 15314182.0,
|
|
"step": 8295
|
|
},
|
|
{
|
|
"entropy": 5.774941158294678,
|
|
"epoch": 0.697332493173703,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004957664847182209,
|
|
"loss": 5.7173,
|
|
"mean_token_accuracy": 0.14531809762120246,
|
|
"num_tokens": 15324213.0,
|
|
"step": 8300
|
|
},
|
|
{
|
|
"entropy": 5.836635112762451,
|
|
"epoch": 0.6977525729888678,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004957607008899774,
|
|
"loss": 5.6305,
|
|
"mean_token_accuracy": 0.14892250299453735,
|
|
"num_tokens": 15333122.0,
|
|
"step": 8305
|
|
},
|
|
{
|
|
"entropy": 5.788960552215576,
|
|
"epoch": 0.6981726528040327,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004957549131510717,
|
|
"loss": 5.7278,
|
|
"mean_token_accuracy": 0.14765103310346603,
|
|
"num_tokens": 15342199.0,
|
|
"step": 8310
|
|
},
|
|
{
|
|
"entropy": 5.840366983413697,
|
|
"epoch": 0.6985927326191976,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004957491215016065,
|
|
"loss": 5.6792,
|
|
"mean_token_accuracy": 0.15214879661798478,
|
|
"num_tokens": 15352463.0,
|
|
"step": 8315
|
|
},
|
|
{
|
|
"entropy": 5.700908565521241,
|
|
"epoch": 0.6990128124343625,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004957433259416841,
|
|
"loss": 5.5231,
|
|
"mean_token_accuracy": 0.1583988130092621,
|
|
"num_tokens": 15361815.0,
|
|
"step": 8320
|
|
},
|
|
{
|
|
"entropy": 5.767258024215698,
|
|
"epoch": 0.6994328922495274,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004957375264714075,
|
|
"loss": 5.6439,
|
|
"mean_token_accuracy": 0.14546388015151024,
|
|
"num_tokens": 15371773.0,
|
|
"step": 8325
|
|
},
|
|
{
|
|
"entropy": 5.697303724288941,
|
|
"epoch": 0.6998529720646923,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004957317230908792,
|
|
"loss": 5.5916,
|
|
"mean_token_accuracy": 0.15627994984388352,
|
|
"num_tokens": 15380881.0,
|
|
"step": 8330
|
|
},
|
|
{
|
|
"entropy": 5.672244882583618,
|
|
"epoch": 0.7002730518798572,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004957259158002022,
|
|
"loss": 5.4522,
|
|
"mean_token_accuracy": 0.16274693012237548,
|
|
"num_tokens": 15389310.0,
|
|
"step": 8335
|
|
},
|
|
{
|
|
"entropy": 5.663344621658325,
|
|
"epoch": 0.700693131695022,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004957201045994791,
|
|
"loss": 5.5548,
|
|
"mean_token_accuracy": 0.15098515748977662,
|
|
"num_tokens": 15398584.0,
|
|
"step": 8340
|
|
},
|
|
{
|
|
"entropy": 5.729818964004517,
|
|
"epoch": 0.7011132115101869,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004957142894888131,
|
|
"loss": 5.5935,
|
|
"mean_token_accuracy": 0.16061321198940276,
|
|
"num_tokens": 15407208.0,
|
|
"step": 8345
|
|
},
|
|
{
|
|
"entropy": 5.745049524307251,
|
|
"epoch": 0.7015332913253518,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004957084704683071,
|
|
"loss": 5.639,
|
|
"mean_token_accuracy": 0.15271784067153932,
|
|
"num_tokens": 15416474.0,
|
|
"step": 8350
|
|
},
|
|
{
|
|
"entropy": 5.730433702468872,
|
|
"epoch": 0.7019533711405167,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004957026475380642,
|
|
"loss": 5.6296,
|
|
"mean_token_accuracy": 0.15648382306098937,
|
|
"num_tokens": 15426101.0,
|
|
"step": 8355
|
|
},
|
|
{
|
|
"entropy": 5.7902411937713625,
|
|
"epoch": 0.7023734509556816,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004956968206981875,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.15503493845462799,
|
|
"num_tokens": 15435910.0,
|
|
"step": 8360
|
|
},
|
|
{
|
|
"entropy": 5.79659628868103,
|
|
"epoch": 0.7027935307708465,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004956909899487803,
|
|
"loss": 5.6948,
|
|
"mean_token_accuracy": 0.1535898119211197,
|
|
"num_tokens": 15445494.0,
|
|
"step": 8365
|
|
},
|
|
{
|
|
"entropy": 5.741454792022705,
|
|
"epoch": 0.7032136105860114,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004956851552899459,
|
|
"loss": 5.5931,
|
|
"mean_token_accuracy": 0.16135696619749068,
|
|
"num_tokens": 15455332.0,
|
|
"step": 8370
|
|
},
|
|
{
|
|
"entropy": 5.763624382019043,
|
|
"epoch": 0.7036336904011762,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004956793167217874,
|
|
"loss": 5.65,
|
|
"mean_token_accuracy": 0.1525983467698097,
|
|
"num_tokens": 15464241.0,
|
|
"step": 8375
|
|
},
|
|
{
|
|
"entropy": 5.82790732383728,
|
|
"epoch": 0.7040537702163411,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004956734742444087,
|
|
"loss": 5.6567,
|
|
"mean_token_accuracy": 0.1559632509946823,
|
|
"num_tokens": 15473473.0,
|
|
"step": 8380
|
|
},
|
|
{
|
|
"entropy": 5.716184997558594,
|
|
"epoch": 0.704473850031506,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004956676278579129,
|
|
"loss": 5.5329,
|
|
"mean_token_accuracy": 0.16085591912269592,
|
|
"num_tokens": 15482494.0,
|
|
"step": 8385
|
|
},
|
|
{
|
|
"entropy": 5.663648986816407,
|
|
"epoch": 0.7048939298466709,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004956617775624037,
|
|
"loss": 5.5538,
|
|
"mean_token_accuracy": 0.15378274619579316,
|
|
"num_tokens": 15491180.0,
|
|
"step": 8390
|
|
},
|
|
{
|
|
"entropy": 5.7593092918396,
|
|
"epoch": 0.7053140096618358,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004956559233579848,
|
|
"loss": 5.6075,
|
|
"mean_token_accuracy": 0.15238080322742462,
|
|
"num_tokens": 15501035.0,
|
|
"step": 8395
|
|
},
|
|
{
|
|
"entropy": 5.747204065322876,
|
|
"epoch": 0.7057340894770007,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004956500652447598,
|
|
"loss": 5.5831,
|
|
"mean_token_accuracy": 0.15643986463546752,
|
|
"num_tokens": 15510191.0,
|
|
"step": 8400
|
|
},
|
|
{
|
|
"entropy": 5.691559076309204,
|
|
"epoch": 0.7061541692921655,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004956442032228324,
|
|
"loss": 5.6635,
|
|
"mean_token_accuracy": 0.15496474727988244,
|
|
"num_tokens": 15519253.0,
|
|
"step": 8405
|
|
},
|
|
{
|
|
"entropy": 5.722681283950806,
|
|
"epoch": 0.7065742491073304,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004956383372923067,
|
|
"loss": 5.627,
|
|
"mean_token_accuracy": 0.15085585564374923,
|
|
"num_tokens": 15528348.0,
|
|
"step": 8410
|
|
},
|
|
{
|
|
"entropy": 5.935694408416748,
|
|
"epoch": 0.7069943289224953,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004956324674532864,
|
|
"loss": 5.7185,
|
|
"mean_token_accuracy": 0.14439277797937394,
|
|
"num_tokens": 15537557.0,
|
|
"step": 8415
|
|
},
|
|
{
|
|
"entropy": 5.8467813491821286,
|
|
"epoch": 0.7074144087376601,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004956265937058757,
|
|
"loss": 5.654,
|
|
"mean_token_accuracy": 0.15044144690036773,
|
|
"num_tokens": 15546745.0,
|
|
"step": 8420
|
|
},
|
|
{
|
|
"entropy": 5.744271945953369,
|
|
"epoch": 0.707834488552825,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004956207160501784,
|
|
"loss": 5.5384,
|
|
"mean_token_accuracy": 0.16095120459794998,
|
|
"num_tokens": 15555532.0,
|
|
"step": 8425
|
|
},
|
|
{
|
|
"entropy": 5.7119933605194095,
|
|
"epoch": 0.70825456836799,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004956148344862987,
|
|
"loss": 5.5878,
|
|
"mean_token_accuracy": 0.16213922202587128,
|
|
"num_tokens": 15564189.0,
|
|
"step": 8430
|
|
},
|
|
{
|
|
"entropy": 5.626652574539184,
|
|
"epoch": 0.7086746481831548,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004956089490143408,
|
|
"loss": 5.6232,
|
|
"mean_token_accuracy": 0.15468233972787857,
|
|
"num_tokens": 15574116.0,
|
|
"step": 8435
|
|
},
|
|
{
|
|
"entropy": 5.845960283279419,
|
|
"epoch": 0.7090947279983196,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004956030596344089,
|
|
"loss": 5.6162,
|
|
"mean_token_accuracy": 0.15749153792858123,
|
|
"num_tokens": 15583031.0,
|
|
"step": 8440
|
|
},
|
|
{
|
|
"entropy": 5.82060866355896,
|
|
"epoch": 0.7095148078134845,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004955971663466075,
|
|
"loss": 5.7384,
|
|
"mean_token_accuracy": 0.1541066735982895,
|
|
"num_tokens": 15592576.0,
|
|
"step": 8445
|
|
},
|
|
{
|
|
"entropy": 5.800911283493042,
|
|
"epoch": 0.7099348876286494,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004955912691510407,
|
|
"loss": 5.6751,
|
|
"mean_token_accuracy": 0.15478230714797975,
|
|
"num_tokens": 15601065.0,
|
|
"step": 8450
|
|
},
|
|
{
|
|
"entropy": 5.75335373878479,
|
|
"epoch": 0.7103549674438143,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004955853680478134,
|
|
"loss": 5.5987,
|
|
"mean_token_accuracy": 0.1501637101173401,
|
|
"num_tokens": 15610112.0,
|
|
"step": 8455
|
|
},
|
|
{
|
|
"entropy": 5.803425455093384,
|
|
"epoch": 0.7107750472589792,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004955794630370297,
|
|
"loss": 5.5779,
|
|
"mean_token_accuracy": 0.15560339987277985,
|
|
"num_tokens": 15618890.0,
|
|
"step": 8460
|
|
},
|
|
{
|
|
"entropy": 5.72348952293396,
|
|
"epoch": 0.7111951270741441,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004955735541187945,
|
|
"loss": 5.6142,
|
|
"mean_token_accuracy": 0.15508751571178436,
|
|
"num_tokens": 15627678.0,
|
|
"step": 8465
|
|
},
|
|
{
|
|
"entropy": 5.737122440338135,
|
|
"epoch": 0.711615206889309,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004955676412932124,
|
|
"loss": 5.5994,
|
|
"mean_token_accuracy": 0.15686758160591124,
|
|
"num_tokens": 15636833.0,
|
|
"step": 8470
|
|
},
|
|
{
|
|
"entropy": 5.708105039596558,
|
|
"epoch": 0.7120352867044738,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004955617245603881,
|
|
"loss": 5.6109,
|
|
"mean_token_accuracy": 0.1494527980685234,
|
|
"num_tokens": 15646571.0,
|
|
"step": 8475
|
|
},
|
|
{
|
|
"entropy": 5.771750783920288,
|
|
"epoch": 0.7124553665196387,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004955558039204263,
|
|
"loss": 5.6511,
|
|
"mean_token_accuracy": 0.16095416396856307,
|
|
"num_tokens": 15654907.0,
|
|
"step": 8480
|
|
},
|
|
{
|
|
"entropy": 5.806697988510132,
|
|
"epoch": 0.7128754463348036,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004955498793734321,
|
|
"loss": 5.5932,
|
|
"mean_token_accuracy": 0.15864523649215698,
|
|
"num_tokens": 15664336.0,
|
|
"step": 8485
|
|
},
|
|
{
|
|
"entropy": 5.8073060512542725,
|
|
"epoch": 0.7132955261499685,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004955439509195103,
|
|
"loss": 5.6601,
|
|
"mean_token_accuracy": 0.15415582954883575,
|
|
"num_tokens": 15674000.0,
|
|
"step": 8490
|
|
},
|
|
{
|
|
"entropy": 5.826253652572632,
|
|
"epoch": 0.7137156059651334,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004955380185587661,
|
|
"loss": 5.6496,
|
|
"mean_token_accuracy": 0.15672145187854766,
|
|
"num_tokens": 15684214.0,
|
|
"step": 8495
|
|
},
|
|
{
|
|
"entropy": 5.762752962112427,
|
|
"epoch": 0.7141356857802983,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004955320822913043,
|
|
"loss": 5.6571,
|
|
"mean_token_accuracy": 0.1523528926074505,
|
|
"num_tokens": 15693546.0,
|
|
"step": 8500
|
|
},
|
|
{
|
|
"entropy": 5.711481142044067,
|
|
"epoch": 0.7145557655954632,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004955261421172302,
|
|
"loss": 5.5738,
|
|
"mean_token_accuracy": 0.15294577926397324,
|
|
"num_tokens": 15702310.0,
|
|
"step": 8505
|
|
},
|
|
{
|
|
"entropy": 5.757646560668945,
|
|
"epoch": 0.714975845410628,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004955201980366493,
|
|
"loss": 5.647,
|
|
"mean_token_accuracy": 0.154372838139534,
|
|
"num_tokens": 15711544.0,
|
|
"step": 8510
|
|
},
|
|
{
|
|
"entropy": 5.690837240219116,
|
|
"epoch": 0.7153959252257929,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004955142500496665,
|
|
"loss": 5.5216,
|
|
"mean_token_accuracy": 0.15888839215040207,
|
|
"num_tokens": 15720914.0,
|
|
"step": 8515
|
|
},
|
|
{
|
|
"entropy": 5.809607696533203,
|
|
"epoch": 0.7158160050409578,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004955082981563872,
|
|
"loss": 5.6077,
|
|
"mean_token_accuracy": 0.1486271560192108,
|
|
"num_tokens": 15729825.0,
|
|
"step": 8520
|
|
},
|
|
{
|
|
"entropy": 5.724713563919067,
|
|
"epoch": 0.7162360848561227,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000495502342356917,
|
|
"loss": 5.6217,
|
|
"mean_token_accuracy": 0.1557765081524849,
|
|
"num_tokens": 15739649.0,
|
|
"step": 8525
|
|
},
|
|
{
|
|
"entropy": 5.726509523391724,
|
|
"epoch": 0.7166561646712876,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004954963826513614,
|
|
"loss": 5.5154,
|
|
"mean_token_accuracy": 0.1566923290491104,
|
|
"num_tokens": 15747805.0,
|
|
"step": 8530
|
|
},
|
|
{
|
|
"entropy": 5.80653567314148,
|
|
"epoch": 0.7170762444864525,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.000495490419039826,
|
|
"loss": 5.6602,
|
|
"mean_token_accuracy": 0.15109436362981796,
|
|
"num_tokens": 15757267.0,
|
|
"step": 8535
|
|
},
|
|
{
|
|
"entropy": 5.7422764778137205,
|
|
"epoch": 0.7174963243016174,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004954844515224162,
|
|
"loss": 5.6074,
|
|
"mean_token_accuracy": 0.1531700074672699,
|
|
"num_tokens": 15767412.0,
|
|
"step": 8540
|
|
},
|
|
{
|
|
"entropy": 5.669848108291626,
|
|
"epoch": 0.7179164041167821,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004954784800992379,
|
|
"loss": 5.6081,
|
|
"mean_token_accuracy": 0.1509648084640503,
|
|
"num_tokens": 15776813.0,
|
|
"step": 8545
|
|
},
|
|
{
|
|
"entropy": 5.8059522151947025,
|
|
"epoch": 0.718336483931947,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004954725047703969,
|
|
"loss": 5.6531,
|
|
"mean_token_accuracy": 0.15384986773133277,
|
|
"num_tokens": 15786258.0,
|
|
"step": 8550
|
|
},
|
|
{
|
|
"entropy": 5.800448894500732,
|
|
"epoch": 0.7187565637471119,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000495466525535999,
|
|
"loss": 5.6218,
|
|
"mean_token_accuracy": 0.1543135389685631,
|
|
"num_tokens": 15795673.0,
|
|
"step": 8555
|
|
},
|
|
{
|
|
"entropy": 5.757376861572266,
|
|
"epoch": 0.7191766435622768,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004954605423961501,
|
|
"loss": 5.634,
|
|
"mean_token_accuracy": 0.1545005664229393,
|
|
"num_tokens": 15805050.0,
|
|
"step": 8560
|
|
},
|
|
{
|
|
"entropy": 5.695555400848389,
|
|
"epoch": 0.7195967233774417,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004954545553509562,
|
|
"loss": 5.5764,
|
|
"mean_token_accuracy": 0.16470458656549453,
|
|
"num_tokens": 15813347.0,
|
|
"step": 8565
|
|
},
|
|
{
|
|
"entropy": 5.851338243484497,
|
|
"epoch": 0.7200168031926066,
|
|
"grad_norm": 1.2578125,
|
|
"learning_rate": 0.0004954485644005235,
|
|
"loss": 5.7053,
|
|
"mean_token_accuracy": 0.14976215064525605,
|
|
"num_tokens": 15823528.0,
|
|
"step": 8570
|
|
},
|
|
{
|
|
"entropy": 5.810816144943237,
|
|
"epoch": 0.7204368830077714,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004954425695449578,
|
|
"loss": 5.597,
|
|
"mean_token_accuracy": 0.15381426066160203,
|
|
"num_tokens": 15832727.0,
|
|
"step": 8575
|
|
},
|
|
{
|
|
"entropy": 5.766140604019165,
|
|
"epoch": 0.7208569628229363,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004954365707843657,
|
|
"loss": 5.6661,
|
|
"mean_token_accuracy": 0.1463111013174057,
|
|
"num_tokens": 15842402.0,
|
|
"step": 8580
|
|
},
|
|
{
|
|
"entropy": 5.698196029663086,
|
|
"epoch": 0.7212770426381012,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004954305681188531,
|
|
"loss": 5.5389,
|
|
"mean_token_accuracy": 0.1566384069621563,
|
|
"num_tokens": 15850886.0,
|
|
"step": 8585
|
|
},
|
|
{
|
|
"entropy": 5.9173768043518065,
|
|
"epoch": 0.7216971224532661,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004954245615485265,
|
|
"loss": 5.8176,
|
|
"mean_token_accuracy": 0.1536174975335598,
|
|
"num_tokens": 15860093.0,
|
|
"step": 8590
|
|
},
|
|
{
|
|
"entropy": 5.758410358428955,
|
|
"epoch": 0.722117202268431,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004954185510734924,
|
|
"loss": 5.5379,
|
|
"mean_token_accuracy": 0.1607280820608139,
|
|
"num_tokens": 15868681.0,
|
|
"step": 8595
|
|
},
|
|
{
|
|
"entropy": 5.739753103256225,
|
|
"epoch": 0.7225372820835959,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004954125366938571,
|
|
"loss": 5.6224,
|
|
"mean_token_accuracy": 0.1589464083313942,
|
|
"num_tokens": 15878041.0,
|
|
"step": 8600
|
|
},
|
|
{
|
|
"entropy": 5.717496919631958,
|
|
"epoch": 0.7229573618987608,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004954065184097271,
|
|
"loss": 5.6144,
|
|
"mean_token_accuracy": 0.15493135452270507,
|
|
"num_tokens": 15887562.0,
|
|
"step": 8605
|
|
},
|
|
{
|
|
"entropy": 5.765741586685181,
|
|
"epoch": 0.7233774417139256,
|
|
"grad_norm": 2.828125,
|
|
"learning_rate": 0.0004954004962212092,
|
|
"loss": 5.5471,
|
|
"mean_token_accuracy": 0.16325860619544982,
|
|
"num_tokens": 15896480.0,
|
|
"step": 8610
|
|
},
|
|
{
|
|
"entropy": 5.920656204223633,
|
|
"epoch": 0.7237975215290905,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004953944701284101,
|
|
"loss": 5.7553,
|
|
"mean_token_accuracy": 0.14824794083833695,
|
|
"num_tokens": 15906743.0,
|
|
"step": 8615
|
|
},
|
|
{
|
|
"entropy": 5.760147953033448,
|
|
"epoch": 0.7242176013442554,
|
|
"grad_norm": 2.859375,
|
|
"learning_rate": 0.0004953884401314363,
|
|
"loss": 5.7146,
|
|
"mean_token_accuracy": 0.1402403138577938,
|
|
"num_tokens": 15915981.0,
|
|
"step": 8620
|
|
},
|
|
{
|
|
"entropy": 5.728390407562256,
|
|
"epoch": 0.7246376811594203,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004953824062303949,
|
|
"loss": 5.5455,
|
|
"mean_token_accuracy": 0.15434545278549194,
|
|
"num_tokens": 15924117.0,
|
|
"step": 8625
|
|
},
|
|
{
|
|
"entropy": 5.702328252792358,
|
|
"epoch": 0.7250577609745852,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004953763684253926,
|
|
"loss": 5.5834,
|
|
"mean_token_accuracy": 0.1618655726313591,
|
|
"num_tokens": 15933124.0,
|
|
"step": 8630
|
|
},
|
|
{
|
|
"entropy": 5.671606540679932,
|
|
"epoch": 0.7254778407897501,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004953703267165364,
|
|
"loss": 5.4707,
|
|
"mean_token_accuracy": 0.15732883363962175,
|
|
"num_tokens": 15942422.0,
|
|
"step": 8635
|
|
},
|
|
{
|
|
"entropy": 5.790132761001587,
|
|
"epoch": 0.725897920604915,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004953642811039332,
|
|
"loss": 5.6894,
|
|
"mean_token_accuracy": 0.15469479262828828,
|
|
"num_tokens": 15950989.0,
|
|
"step": 8640
|
|
},
|
|
{
|
|
"entropy": 5.856866359710693,
|
|
"epoch": 0.7263180004200798,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004953582315876904,
|
|
"loss": 5.6948,
|
|
"mean_token_accuracy": 0.15004287138581276,
|
|
"num_tokens": 15959659.0,
|
|
"step": 8645
|
|
},
|
|
{
|
|
"entropy": 5.730572128295899,
|
|
"epoch": 0.7267380802352447,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000495352178167915,
|
|
"loss": 5.5797,
|
|
"mean_token_accuracy": 0.16224073469638825,
|
|
"num_tokens": 15968102.0,
|
|
"step": 8650
|
|
},
|
|
{
|
|
"entropy": 5.820402717590332,
|
|
"epoch": 0.7271581600504096,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004953461208447143,
|
|
"loss": 5.6882,
|
|
"mean_token_accuracy": 0.14849866628646852,
|
|
"num_tokens": 15977705.0,
|
|
"step": 8655
|
|
},
|
|
{
|
|
"entropy": 5.7769698143005375,
|
|
"epoch": 0.7275782398655745,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004953400596181953,
|
|
"loss": 5.6721,
|
|
"mean_token_accuracy": 0.1488412395119667,
|
|
"num_tokens": 15986703.0,
|
|
"step": 8660
|
|
},
|
|
{
|
|
"entropy": 5.752463150024414,
|
|
"epoch": 0.7279983196807394,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004953339944884657,
|
|
"loss": 5.5871,
|
|
"mean_token_accuracy": 0.1577693849802017,
|
|
"num_tokens": 15995672.0,
|
|
"step": 8665
|
|
},
|
|
{
|
|
"entropy": 5.61082501411438,
|
|
"epoch": 0.7284183994959043,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004953279254556329,
|
|
"loss": 5.5399,
|
|
"mean_token_accuracy": 0.1641200602054596,
|
|
"num_tokens": 16004437.0,
|
|
"step": 8670
|
|
},
|
|
{
|
|
"entropy": 5.707981443405151,
|
|
"epoch": 0.7288384793110692,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004953218525198043,
|
|
"loss": 5.575,
|
|
"mean_token_accuracy": 0.15932455360889436,
|
|
"num_tokens": 16012847.0,
|
|
"step": 8675
|
|
},
|
|
{
|
|
"entropy": 5.810865259170532,
|
|
"epoch": 0.7292585591262339,
|
|
"grad_norm": 2.28125,
|
|
"learning_rate": 0.0004953157756810876,
|
|
"loss": 5.6175,
|
|
"mean_token_accuracy": 0.15550039410591127,
|
|
"num_tokens": 16022213.0,
|
|
"step": 8680
|
|
},
|
|
{
|
|
"entropy": 5.780650091171265,
|
|
"epoch": 0.7296786389413988,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004953096949395902,
|
|
"loss": 5.6738,
|
|
"mean_token_accuracy": 0.15553813129663469,
|
|
"num_tokens": 16031411.0,
|
|
"step": 8685
|
|
},
|
|
{
|
|
"entropy": 5.803027772903443,
|
|
"epoch": 0.7300987187565637,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004953036102954202,
|
|
"loss": 5.7067,
|
|
"mean_token_accuracy": 0.15027424544095994,
|
|
"num_tokens": 16041227.0,
|
|
"step": 8690
|
|
},
|
|
{
|
|
"entropy": 5.769024753570557,
|
|
"epoch": 0.7305187985717286,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004952975217486852,
|
|
"loss": 5.5608,
|
|
"mean_token_accuracy": 0.15838102549314498,
|
|
"num_tokens": 16049777.0,
|
|
"step": 8695
|
|
},
|
|
{
|
|
"entropy": 5.784084749221802,
|
|
"epoch": 0.7309388783868935,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004952914292994928,
|
|
"loss": 5.6425,
|
|
"mean_token_accuracy": 0.15665655136108397,
|
|
"num_tokens": 16059093.0,
|
|
"step": 8700
|
|
},
|
|
{
|
|
"entropy": 5.746081352233887,
|
|
"epoch": 0.7313589582020584,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004952853329479514,
|
|
"loss": 5.6752,
|
|
"mean_token_accuracy": 0.1572483465075493,
|
|
"num_tokens": 16068550.0,
|
|
"step": 8705
|
|
},
|
|
{
|
|
"entropy": 5.760213088989258,
|
|
"epoch": 0.7317790380172233,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004952792326941686,
|
|
"loss": 5.6979,
|
|
"mean_token_accuracy": 0.15022268071770667,
|
|
"num_tokens": 16078286.0,
|
|
"step": 8710
|
|
},
|
|
{
|
|
"entropy": 5.767646551132202,
|
|
"epoch": 0.7321991178323881,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004952731285382527,
|
|
"loss": 5.638,
|
|
"mean_token_accuracy": 0.15017070174217223,
|
|
"num_tokens": 16087560.0,
|
|
"step": 8715
|
|
},
|
|
{
|
|
"entropy": 5.730300951004028,
|
|
"epoch": 0.732619197647553,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004952670204803118,
|
|
"loss": 5.6012,
|
|
"mean_token_accuracy": 0.16169655174016953,
|
|
"num_tokens": 16097478.0,
|
|
"step": 8720
|
|
},
|
|
{
|
|
"entropy": 5.802074241638183,
|
|
"epoch": 0.7330392774627179,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004952609085204539,
|
|
"loss": 5.6932,
|
|
"mean_token_accuracy": 0.16233148872852327,
|
|
"num_tokens": 16106884.0,
|
|
"step": 8725
|
|
},
|
|
{
|
|
"entropy": 5.745241928100586,
|
|
"epoch": 0.7334593572778828,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004952547926587876,
|
|
"loss": 5.6056,
|
|
"mean_token_accuracy": 0.14735328257083893,
|
|
"num_tokens": 16115689.0,
|
|
"step": 8730
|
|
},
|
|
{
|
|
"entropy": 5.717710161209107,
|
|
"epoch": 0.7338794370930477,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004952486728954209,
|
|
"loss": 5.5406,
|
|
"mean_token_accuracy": 0.1627890720963478,
|
|
"num_tokens": 16125237.0,
|
|
"step": 8735
|
|
},
|
|
{
|
|
"entropy": 5.711509656906128,
|
|
"epoch": 0.7342995169082126,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004952425492304624,
|
|
"loss": 5.5612,
|
|
"mean_token_accuracy": 0.1623712345957756,
|
|
"num_tokens": 16133940.0,
|
|
"step": 8740
|
|
},
|
|
{
|
|
"entropy": 5.761761999130249,
|
|
"epoch": 0.7347195967233774,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004952364216640207,
|
|
"loss": 5.6536,
|
|
"mean_token_accuracy": 0.1498600222170353,
|
|
"num_tokens": 16143256.0,
|
|
"step": 8745
|
|
},
|
|
{
|
|
"entropy": 5.749999523162842,
|
|
"epoch": 0.7351396765385423,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000495230290196204,
|
|
"loss": 5.5377,
|
|
"mean_token_accuracy": 0.1550111636519432,
|
|
"num_tokens": 16153259.0,
|
|
"step": 8750
|
|
},
|
|
{
|
|
"entropy": 5.784195375442505,
|
|
"epoch": 0.7355597563537072,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004952241548271212,
|
|
"loss": 5.7846,
|
|
"mean_token_accuracy": 0.14801348447799684,
|
|
"num_tokens": 16162125.0,
|
|
"step": 8755
|
|
},
|
|
{
|
|
"entropy": 5.844126415252686,
|
|
"epoch": 0.7359798361688721,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004952180155568809,
|
|
"loss": 5.6959,
|
|
"mean_token_accuracy": 0.15129489600658416,
|
|
"num_tokens": 16171680.0,
|
|
"step": 8760
|
|
},
|
|
{
|
|
"entropy": 5.847824907302856,
|
|
"epoch": 0.736399915984037,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004952118723855919,
|
|
"loss": 5.7082,
|
|
"mean_token_accuracy": 0.15476434975862502,
|
|
"num_tokens": 16181559.0,
|
|
"step": 8765
|
|
},
|
|
{
|
|
"entropy": 5.748734664916992,
|
|
"epoch": 0.7368199957992019,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004952057253133628,
|
|
"loss": 5.6461,
|
|
"mean_token_accuracy": 0.1511749282479286,
|
|
"num_tokens": 16190611.0,
|
|
"step": 8770
|
|
},
|
|
{
|
|
"entropy": 5.809700012207031,
|
|
"epoch": 0.7372400756143668,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004951995743403028,
|
|
"loss": 5.6609,
|
|
"mean_token_accuracy": 0.1494298830628395,
|
|
"num_tokens": 16200156.0,
|
|
"step": 8775
|
|
},
|
|
{
|
|
"entropy": 5.723226308822632,
|
|
"epoch": 0.7376601554295316,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004951934194665208,
|
|
"loss": 5.626,
|
|
"mean_token_accuracy": 0.14901441484689712,
|
|
"num_tokens": 16209808.0,
|
|
"step": 8780
|
|
},
|
|
{
|
|
"entropy": 5.698558235168457,
|
|
"epoch": 0.7380802352446965,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004951872606921257,
|
|
"loss": 5.5831,
|
|
"mean_token_accuracy": 0.1541206181049347,
|
|
"num_tokens": 16219243.0,
|
|
"step": 8785
|
|
},
|
|
{
|
|
"entropy": 5.68091344833374,
|
|
"epoch": 0.7385003150598614,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004951810980172265,
|
|
"loss": 5.5962,
|
|
"mean_token_accuracy": 0.1625403195619583,
|
|
"num_tokens": 16228180.0,
|
|
"step": 8790
|
|
},
|
|
{
|
|
"entropy": 5.785898876190186,
|
|
"epoch": 0.7389203948750263,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004951749314419327,
|
|
"loss": 5.6209,
|
|
"mean_token_accuracy": 0.1556001588702202,
|
|
"num_tokens": 16237045.0,
|
|
"step": 8795
|
|
},
|
|
{
|
|
"entropy": 5.7458240509033205,
|
|
"epoch": 0.7393404746901912,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004951687609663533,
|
|
"loss": 5.5277,
|
|
"mean_token_accuracy": 0.16299475580453873,
|
|
"num_tokens": 16245307.0,
|
|
"step": 8800
|
|
},
|
|
{
|
|
"entropy": 5.7147376537323,
|
|
"epoch": 0.739760554505356,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004951625865905977,
|
|
"loss": 5.5793,
|
|
"mean_token_accuracy": 0.1496004968881607,
|
|
"num_tokens": 16255047.0,
|
|
"step": 8805
|
|
},
|
|
{
|
|
"entropy": 5.714106130599975,
|
|
"epoch": 0.740180634320521,
|
|
"grad_norm": 3.0,
|
|
"learning_rate": 0.0004951564083147753,
|
|
"loss": 5.6177,
|
|
"mean_token_accuracy": 0.16076902598142623,
|
|
"num_tokens": 16264969.0,
|
|
"step": 8810
|
|
},
|
|
{
|
|
"entropy": 5.778882122039795,
|
|
"epoch": 0.7406007141356857,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004951502261389953,
|
|
"loss": 5.7085,
|
|
"mean_token_accuracy": 0.14696210622787476,
|
|
"num_tokens": 16274757.0,
|
|
"step": 8815
|
|
},
|
|
{
|
|
"entropy": 5.745514392852783,
|
|
"epoch": 0.7410207939508506,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004951440400633677,
|
|
"loss": 5.6118,
|
|
"mean_token_accuracy": 0.16788006722927093,
|
|
"num_tokens": 16283409.0,
|
|
"step": 8820
|
|
},
|
|
{
|
|
"entropy": 5.71191258430481,
|
|
"epoch": 0.7414408737660155,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004951378500880015,
|
|
"loss": 5.5751,
|
|
"mean_token_accuracy": 0.15628346055746078,
|
|
"num_tokens": 16293206.0,
|
|
"step": 8825
|
|
},
|
|
{
|
|
"entropy": 5.776321268081665,
|
|
"epoch": 0.7418609535811804,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004951316562130067,
|
|
"loss": 5.6155,
|
|
"mean_token_accuracy": 0.15167658925056457,
|
|
"num_tokens": 16303121.0,
|
|
"step": 8830
|
|
},
|
|
{
|
|
"entropy": 5.771509647369385,
|
|
"epoch": 0.7422810333963453,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.000495125458438493,
|
|
"loss": 5.5594,
|
|
"mean_token_accuracy": 0.1614787310361862,
|
|
"num_tokens": 16312710.0,
|
|
"step": 8835
|
|
},
|
|
{
|
|
"entropy": 5.833417177200317,
|
|
"epoch": 0.7427011132115102,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004951192567645702,
|
|
"loss": 5.768,
|
|
"mean_token_accuracy": 0.14854381531476973,
|
|
"num_tokens": 16322280.0,
|
|
"step": 8840
|
|
},
|
|
{
|
|
"entropy": 5.673296499252319,
|
|
"epoch": 0.7431211930266751,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004951130511913481,
|
|
"loss": 5.6098,
|
|
"mean_token_accuracy": 0.1588355764746666,
|
|
"num_tokens": 16331656.0,
|
|
"step": 8845
|
|
},
|
|
{
|
|
"entropy": 5.734766006469727,
|
|
"epoch": 0.7435412728418399,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004951068417189366,
|
|
"loss": 5.63,
|
|
"mean_token_accuracy": 0.15699938535690308,
|
|
"num_tokens": 16341074.0,
|
|
"step": 8850
|
|
},
|
|
{
|
|
"entropy": 5.8237183570861815,
|
|
"epoch": 0.7439613526570048,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004951006283474457,
|
|
"loss": 5.6113,
|
|
"mean_token_accuracy": 0.15277699157595634,
|
|
"num_tokens": 16350097.0,
|
|
"step": 8855
|
|
},
|
|
{
|
|
"entropy": 5.658589601516724,
|
|
"epoch": 0.7443814324721697,
|
|
"grad_norm": 2.796875,
|
|
"learning_rate": 0.0004950944110769856,
|
|
"loss": 5.5342,
|
|
"mean_token_accuracy": 0.15926240533590316,
|
|
"num_tokens": 16359274.0,
|
|
"step": 8860
|
|
},
|
|
{
|
|
"entropy": 5.662064790725708,
|
|
"epoch": 0.7448015122873346,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004950881899076663,
|
|
"loss": 5.5024,
|
|
"mean_token_accuracy": 0.17023404836654663,
|
|
"num_tokens": 16368445.0,
|
|
"step": 8865
|
|
},
|
|
{
|
|
"entropy": 5.7490709781646725,
|
|
"epoch": 0.7452215921024995,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004950819648395979,
|
|
"loss": 5.6167,
|
|
"mean_token_accuracy": 0.15595170110464096,
|
|
"num_tokens": 16377689.0,
|
|
"step": 8870
|
|
},
|
|
{
|
|
"entropy": 5.68521409034729,
|
|
"epoch": 0.7456416719176644,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000495075735872891,
|
|
"loss": 5.5741,
|
|
"mean_token_accuracy": 0.1552984483540058,
|
|
"num_tokens": 16386713.0,
|
|
"step": 8875
|
|
},
|
|
{
|
|
"entropy": 5.772252321243286,
|
|
"epoch": 0.7460617517328293,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004950695030076557,
|
|
"loss": 5.5904,
|
|
"mean_token_accuracy": 0.14960389882326125,
|
|
"num_tokens": 16395390.0,
|
|
"step": 8880
|
|
},
|
|
{
|
|
"entropy": 5.771421718597412,
|
|
"epoch": 0.7464818315479941,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004950632662440027,
|
|
"loss": 5.6654,
|
|
"mean_token_accuracy": 0.15664495676755905,
|
|
"num_tokens": 16404531.0,
|
|
"step": 8885
|
|
},
|
|
{
|
|
"entropy": 5.691759538650513,
|
|
"epoch": 0.746901911363159,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004950570255820419,
|
|
"loss": 5.5474,
|
|
"mean_token_accuracy": 0.15819469094276428,
|
|
"num_tokens": 16413649.0,
|
|
"step": 8890
|
|
},
|
|
{
|
|
"entropy": 5.676243829727173,
|
|
"epoch": 0.7473219911783239,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004950507810218843,
|
|
"loss": 5.7051,
|
|
"mean_token_accuracy": 0.14750183895230293,
|
|
"num_tokens": 16423247.0,
|
|
"step": 8895
|
|
},
|
|
{
|
|
"entropy": 5.826502656936645,
|
|
"epoch": 0.7477420709934888,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004950445325636405,
|
|
"loss": 5.6103,
|
|
"mean_token_accuracy": 0.15324292033910752,
|
|
"num_tokens": 16432190.0,
|
|
"step": 8900
|
|
},
|
|
{
|
|
"entropy": 5.820600986480713,
|
|
"epoch": 0.7481621508086537,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004950382802074211,
|
|
"loss": 5.573,
|
|
"mean_token_accuracy": 0.15902330875396728,
|
|
"num_tokens": 16443091.0,
|
|
"step": 8905
|
|
},
|
|
{
|
|
"entropy": 5.668134021759033,
|
|
"epoch": 0.7485822306238186,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004950320239533369,
|
|
"loss": 5.6035,
|
|
"mean_token_accuracy": 0.15703300386667252,
|
|
"num_tokens": 16452077.0,
|
|
"step": 8910
|
|
},
|
|
{
|
|
"entropy": 5.829920911788941,
|
|
"epoch": 0.7490023104389834,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004950257638014986,
|
|
"loss": 5.7188,
|
|
"mean_token_accuracy": 0.15111715570092202,
|
|
"num_tokens": 16461893.0,
|
|
"step": 8915
|
|
},
|
|
{
|
|
"entropy": 5.807134437561035,
|
|
"epoch": 0.7494223902541483,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004950194997520172,
|
|
"loss": 5.5552,
|
|
"mean_token_accuracy": 0.1532232567667961,
|
|
"num_tokens": 16470904.0,
|
|
"step": 8920
|
|
},
|
|
{
|
|
"entropy": 5.714702939987182,
|
|
"epoch": 0.7498424700693131,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004950132318050037,
|
|
"loss": 5.6202,
|
|
"mean_token_accuracy": 0.15520734190940857,
|
|
"num_tokens": 16480130.0,
|
|
"step": 8925
|
|
},
|
|
{
|
|
"entropy": 5.704076814651489,
|
|
"epoch": 0.750262549884478,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004950069599605691,
|
|
"loss": 5.675,
|
|
"mean_token_accuracy": 0.1532868653535843,
|
|
"num_tokens": 16489485.0,
|
|
"step": 8930
|
|
},
|
|
{
|
|
"entropy": 5.757295370101929,
|
|
"epoch": 0.750682629699643,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004950006842188245,
|
|
"loss": 5.6303,
|
|
"mean_token_accuracy": 0.15377950444817542,
|
|
"num_tokens": 16498529.0,
|
|
"step": 8935
|
|
},
|
|
{
|
|
"entropy": 5.776675844192505,
|
|
"epoch": 0.7511027095148078,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000494994404579881,
|
|
"loss": 5.5551,
|
|
"mean_token_accuracy": 0.15588496178388594,
|
|
"num_tokens": 16508094.0,
|
|
"step": 8940
|
|
},
|
|
{
|
|
"entropy": 5.746360969543457,
|
|
"epoch": 0.7515227893299727,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00049498812104385,
|
|
"loss": 5.648,
|
|
"mean_token_accuracy": 0.15265263319015504,
|
|
"num_tokens": 16517620.0,
|
|
"step": 8945
|
|
},
|
|
{
|
|
"entropy": 5.644059801101685,
|
|
"epoch": 0.7519428691451375,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004949818336108425,
|
|
"loss": 5.6292,
|
|
"mean_token_accuracy": 0.1522407725453377,
|
|
"num_tokens": 16526720.0,
|
|
"step": 8950
|
|
},
|
|
{
|
|
"entropy": 5.75996880531311,
|
|
"epoch": 0.7523629489603024,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004949755422809703,
|
|
"loss": 5.6226,
|
|
"mean_token_accuracy": 0.15166783481836318,
|
|
"num_tokens": 16535979.0,
|
|
"step": 8955
|
|
},
|
|
{
|
|
"entropy": 5.7575671672821045,
|
|
"epoch": 0.7527830287754673,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004949692470543446,
|
|
"loss": 5.5063,
|
|
"mean_token_accuracy": 0.16554583013057708,
|
|
"num_tokens": 16544538.0,
|
|
"step": 8960
|
|
},
|
|
{
|
|
"entropy": 5.661995124816895,
|
|
"epoch": 0.7532031085906322,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004949629479310769,
|
|
"loss": 5.5843,
|
|
"mean_token_accuracy": 0.15953092277050018,
|
|
"num_tokens": 16553962.0,
|
|
"step": 8965
|
|
},
|
|
{
|
|
"entropy": 5.749283409118652,
|
|
"epoch": 0.7536231884057971,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004949566449112788,
|
|
"loss": 5.5071,
|
|
"mean_token_accuracy": 0.1593552276492119,
|
|
"num_tokens": 16562652.0,
|
|
"step": 8970
|
|
},
|
|
{
|
|
"entropy": 5.792291498184204,
|
|
"epoch": 0.754043268220962,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004949503379950621,
|
|
"loss": 5.5993,
|
|
"mean_token_accuracy": 0.15694511979818343,
|
|
"num_tokens": 16570887.0,
|
|
"step": 8975
|
|
},
|
|
{
|
|
"entropy": 5.760328102111816,
|
|
"epoch": 0.7544633480361269,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004949440271825385,
|
|
"loss": 5.7427,
|
|
"mean_token_accuracy": 0.15050431489944457,
|
|
"num_tokens": 16581469.0,
|
|
"step": 8980
|
|
},
|
|
{
|
|
"entropy": 5.7655247211456295,
|
|
"epoch": 0.7548834278512917,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004949377124738196,
|
|
"loss": 5.621,
|
|
"mean_token_accuracy": 0.14822689667344094,
|
|
"num_tokens": 16590213.0,
|
|
"step": 8985
|
|
},
|
|
{
|
|
"entropy": 5.710941362380981,
|
|
"epoch": 0.7553035076664566,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004949313938690174,
|
|
"loss": 5.6041,
|
|
"mean_token_accuracy": 0.1535157397389412,
|
|
"num_tokens": 16598384.0,
|
|
"step": 8990
|
|
},
|
|
{
|
|
"entropy": 5.6081578731536865,
|
|
"epoch": 0.7557235874816215,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004949250713682438,
|
|
"loss": 5.5748,
|
|
"mean_token_accuracy": 0.15989954620599747,
|
|
"num_tokens": 16607670.0,
|
|
"step": 8995
|
|
},
|
|
{
|
|
"entropy": 5.772069692611694,
|
|
"epoch": 0.7561436672967864,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004949187449716107,
|
|
"loss": 5.6748,
|
|
"mean_token_accuracy": 0.15062423944473266,
|
|
"num_tokens": 16617560.0,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"epoch": 0.7561436672967864,
|
|
"eval_entropy": 5.61633398843715,
|
|
"eval_loss": 5.63568115234375,
|
|
"eval_mean_token_accuracy": 0.16088676016583953,
|
|
"eval_num_tokens": 16617560.0,
|
|
"eval_runtime": 27.2667,
|
|
"eval_samples_per_second": 1370.39,
|
|
"eval_steps_per_second": 171.308,
|
|
"step": 9000
|
|
},
|
|
{
|
|
"entropy": 5.758602285385132,
|
|
"epoch": 0.7565637471119513,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004949124146792304,
|
|
"loss": 5.5707,
|
|
"mean_token_accuracy": 0.16094929501414298,
|
|
"num_tokens": 16626038.0,
|
|
"step": 9005
|
|
},
|
|
{
|
|
"entropy": 5.69229474067688,
|
|
"epoch": 0.7569838269271162,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004949060804912149,
|
|
"loss": 5.5829,
|
|
"mean_token_accuracy": 0.15550542622804642,
|
|
"num_tokens": 16636490.0,
|
|
"step": 9010
|
|
},
|
|
{
|
|
"entropy": 5.739988565444946,
|
|
"epoch": 0.7574039067422811,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004948997424076764,
|
|
"loss": 5.5949,
|
|
"mean_token_accuracy": 0.15632076263427735,
|
|
"num_tokens": 16645369.0,
|
|
"step": 9015
|
|
},
|
|
{
|
|
"entropy": 5.8396875858306885,
|
|
"epoch": 0.7578239865574459,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004948934004287272,
|
|
"loss": 5.6646,
|
|
"mean_token_accuracy": 0.15439294129610062,
|
|
"num_tokens": 16654348.0,
|
|
"step": 9020
|
|
},
|
|
{
|
|
"entropy": 5.805532503128052,
|
|
"epoch": 0.7582440663726108,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004948870545544796,
|
|
"loss": 5.6685,
|
|
"mean_token_accuracy": 0.14863917008042335,
|
|
"num_tokens": 16664009.0,
|
|
"step": 9025
|
|
},
|
|
{
|
|
"entropy": 5.720396566390991,
|
|
"epoch": 0.7586641461877757,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000494880704785046,
|
|
"loss": 5.6789,
|
|
"mean_token_accuracy": 0.15146776288747787,
|
|
"num_tokens": 16674079.0,
|
|
"step": 9030
|
|
},
|
|
{
|
|
"entropy": 5.800774335861206,
|
|
"epoch": 0.7590842260029406,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004948743511205392,
|
|
"loss": 5.6275,
|
|
"mean_token_accuracy": 0.1500827968120575,
|
|
"num_tokens": 16683687.0,
|
|
"step": 9035
|
|
},
|
|
{
|
|
"entropy": 5.75395884513855,
|
|
"epoch": 0.7595043058181055,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004948679935610712,
|
|
"loss": 5.5145,
|
|
"mean_token_accuracy": 0.16577970534563063,
|
|
"num_tokens": 16693311.0,
|
|
"step": 9040
|
|
},
|
|
{
|
|
"entropy": 5.70733790397644,
|
|
"epoch": 0.7599243856332704,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000494861632106755,
|
|
"loss": 5.5713,
|
|
"mean_token_accuracy": 0.15583091229200363,
|
|
"num_tokens": 16702121.0,
|
|
"step": 9045
|
|
},
|
|
{
|
|
"entropy": 5.73174524307251,
|
|
"epoch": 0.7603444654484351,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004948552667577033,
|
|
"loss": 5.5963,
|
|
"mean_token_accuracy": 0.15507323294878006,
|
|
"num_tokens": 16711883.0,
|
|
"step": 9050
|
|
},
|
|
{
|
|
"entropy": 5.765106439590454,
|
|
"epoch": 0.7607645452636,
|
|
"grad_norm": 3.609375,
|
|
"learning_rate": 0.0004948488975140286,
|
|
"loss": 5.6685,
|
|
"mean_token_accuracy": 0.14950548410415648,
|
|
"num_tokens": 16721449.0,
|
|
"step": 9055
|
|
},
|
|
{
|
|
"entropy": 5.715044593811035,
|
|
"epoch": 0.7611846250787649,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000494842524375844,
|
|
"loss": 5.5945,
|
|
"mean_token_accuracy": 0.153316530585289,
|
|
"num_tokens": 16730068.0,
|
|
"step": 9060
|
|
},
|
|
{
|
|
"entropy": 5.686846685409546,
|
|
"epoch": 0.7616047048939298,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004948361473432623,
|
|
"loss": 5.6167,
|
|
"mean_token_accuracy": 0.1580433189868927,
|
|
"num_tokens": 16739970.0,
|
|
"step": 9065
|
|
},
|
|
{
|
|
"entropy": 5.780406522750854,
|
|
"epoch": 0.7620247847090947,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004948297664163964,
|
|
"loss": 5.674,
|
|
"mean_token_accuracy": 0.14935452342033387,
|
|
"num_tokens": 16749461.0,
|
|
"step": 9070
|
|
},
|
|
{
|
|
"entropy": 5.811807107925415,
|
|
"epoch": 0.7624448645242596,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004948233815953593,
|
|
"loss": 5.7538,
|
|
"mean_token_accuracy": 0.15181455612182618,
|
|
"num_tokens": 16758747.0,
|
|
"step": 9075
|
|
},
|
|
{
|
|
"entropy": 5.731409168243408,
|
|
"epoch": 0.7628649443394245,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004948169928802643,
|
|
"loss": 5.4593,
|
|
"mean_token_accuracy": 0.16504789292812347,
|
|
"num_tokens": 16767212.0,
|
|
"step": 9080
|
|
},
|
|
{
|
|
"entropy": 5.768598890304565,
|
|
"epoch": 0.7632850241545893,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004948106002712245,
|
|
"loss": 5.6169,
|
|
"mean_token_accuracy": 0.15554029792547225,
|
|
"num_tokens": 16776514.0,
|
|
"step": 9085
|
|
},
|
|
{
|
|
"entropy": 5.767139911651611,
|
|
"epoch": 0.7637051039697542,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004948042037683529,
|
|
"loss": 5.6008,
|
|
"mean_token_accuracy": 0.15485090836882592,
|
|
"num_tokens": 16786310.0,
|
|
"step": 9090
|
|
},
|
|
{
|
|
"entropy": 5.766057252883911,
|
|
"epoch": 0.7641251837849191,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004947978033717632,
|
|
"loss": 5.6302,
|
|
"mean_token_accuracy": 0.1551863893866539,
|
|
"num_tokens": 16795551.0,
|
|
"step": 9095
|
|
},
|
|
{
|
|
"entropy": 5.779314613342285,
|
|
"epoch": 0.764545263600084,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004947913990815684,
|
|
"loss": 5.6047,
|
|
"mean_token_accuracy": 0.156571102142334,
|
|
"num_tokens": 16805099.0,
|
|
"step": 9100
|
|
},
|
|
{
|
|
"entropy": 5.7028850555419925,
|
|
"epoch": 0.7649653434152489,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.0004947849908978824,
|
|
"loss": 5.6365,
|
|
"mean_token_accuracy": 0.15258645862340928,
|
|
"num_tokens": 16813963.0,
|
|
"step": 9105
|
|
},
|
|
{
|
|
"entropy": 5.773057985305786,
|
|
"epoch": 0.7653854232304138,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004947785788208182,
|
|
"loss": 5.6729,
|
|
"mean_token_accuracy": 0.15363204330205918,
|
|
"num_tokens": 16822814.0,
|
|
"step": 9110
|
|
},
|
|
{
|
|
"entropy": 5.817337226867676,
|
|
"epoch": 0.7658055030455787,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004947721628504898,
|
|
"loss": 5.7105,
|
|
"mean_token_accuracy": 0.14875250086188316,
|
|
"num_tokens": 16831906.0,
|
|
"step": 9115
|
|
},
|
|
{
|
|
"entropy": 5.712357425689698,
|
|
"epoch": 0.7662255828607435,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004947657429870108,
|
|
"loss": 5.5169,
|
|
"mean_token_accuracy": 0.1595179632306099,
|
|
"num_tokens": 16840050.0,
|
|
"step": 9120
|
|
},
|
|
{
|
|
"entropy": 5.650595998764038,
|
|
"epoch": 0.7666456626759084,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004947593192304946,
|
|
"loss": 5.5462,
|
|
"mean_token_accuracy": 0.15449218600988388,
|
|
"num_tokens": 16848404.0,
|
|
"step": 9125
|
|
},
|
|
{
|
|
"entropy": 5.7080615043640135,
|
|
"epoch": 0.7670657424910733,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004947528915810554,
|
|
"loss": 5.5407,
|
|
"mean_token_accuracy": 0.15641333758831025,
|
|
"num_tokens": 16856568.0,
|
|
"step": 9130
|
|
},
|
|
{
|
|
"entropy": 5.756184530258179,
|
|
"epoch": 0.7674858223062382,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004947464600388066,
|
|
"loss": 5.5755,
|
|
"mean_token_accuracy": 0.16077920794487,
|
|
"num_tokens": 16864936.0,
|
|
"step": 9135
|
|
},
|
|
{
|
|
"entropy": 5.886310577392578,
|
|
"epoch": 0.7679059021214031,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004947400246038627,
|
|
"loss": 5.7142,
|
|
"mean_token_accuracy": 0.1526971772313118,
|
|
"num_tokens": 16874504.0,
|
|
"step": 9140
|
|
},
|
|
{
|
|
"entropy": 5.670875883102417,
|
|
"epoch": 0.768325981936568,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004947335852763374,
|
|
"loss": 5.4664,
|
|
"mean_token_accuracy": 0.15701207965612413,
|
|
"num_tokens": 16883365.0,
|
|
"step": 9145
|
|
},
|
|
{
|
|
"entropy": 5.751158428192139,
|
|
"epoch": 0.7687460617517329,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004947271420563447,
|
|
"loss": 5.7208,
|
|
"mean_token_accuracy": 0.14481201022863388,
|
|
"num_tokens": 16892701.0,
|
|
"step": 9150
|
|
},
|
|
{
|
|
"entropy": 5.7310854434967045,
|
|
"epoch": 0.7691661415668977,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004947206949439989,
|
|
"loss": 5.5348,
|
|
"mean_token_accuracy": 0.1528614804148674,
|
|
"num_tokens": 16901864.0,
|
|
"step": 9155
|
|
},
|
|
{
|
|
"entropy": 5.725323820114136,
|
|
"epoch": 0.7695862213820626,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.000494714243939414,
|
|
"loss": 5.5737,
|
|
"mean_token_accuracy": 0.16332917958498,
|
|
"num_tokens": 16910908.0,
|
|
"step": 9160
|
|
},
|
|
{
|
|
"entropy": 5.7280755043029785,
|
|
"epoch": 0.7700063011972275,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004947077890427045,
|
|
"loss": 5.6237,
|
|
"mean_token_accuracy": 0.1593033030629158,
|
|
"num_tokens": 16920299.0,
|
|
"step": 9165
|
|
},
|
|
{
|
|
"entropy": 5.845327138900757,
|
|
"epoch": 0.7704263810123924,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004947013302539846,
|
|
"loss": 5.7456,
|
|
"mean_token_accuracy": 0.1439913384616375,
|
|
"num_tokens": 16930027.0,
|
|
"step": 9170
|
|
},
|
|
{
|
|
"entropy": 5.800181293487549,
|
|
"epoch": 0.7708464608275573,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004946948675733688,
|
|
"loss": 5.6265,
|
|
"mean_token_accuracy": 0.15197980552911758,
|
|
"num_tokens": 16939387.0,
|
|
"step": 9175
|
|
},
|
|
{
|
|
"entropy": 5.726575946807861,
|
|
"epoch": 0.7712665406427222,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004946884010009714,
|
|
"loss": 5.5999,
|
|
"mean_token_accuracy": 0.15767696648836135,
|
|
"num_tokens": 16950024.0,
|
|
"step": 9180
|
|
},
|
|
{
|
|
"entropy": 5.690348434448242,
|
|
"epoch": 0.771686620457887,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004946819305369073,
|
|
"loss": 5.4956,
|
|
"mean_token_accuracy": 0.16214724481105805,
|
|
"num_tokens": 16958219.0,
|
|
"step": 9185
|
|
},
|
|
{
|
|
"entropy": 5.695686388015747,
|
|
"epoch": 0.7721067002730518,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004946754561812909,
|
|
"loss": 5.4751,
|
|
"mean_token_accuracy": 0.16501737236976624,
|
|
"num_tokens": 16966829.0,
|
|
"step": 9190
|
|
},
|
|
{
|
|
"entropy": 5.6394744396209715,
|
|
"epoch": 0.7725267800882167,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004946689779342367,
|
|
"loss": 5.5829,
|
|
"mean_token_accuracy": 0.157248455286026,
|
|
"num_tokens": 16975585.0,
|
|
"step": 9195
|
|
},
|
|
{
|
|
"entropy": 5.688712549209595,
|
|
"epoch": 0.7729468599033816,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004946624957958599,
|
|
"loss": 5.5508,
|
|
"mean_token_accuracy": 0.15958887487649917,
|
|
"num_tokens": 16984848.0,
|
|
"step": 9200
|
|
},
|
|
{
|
|
"entropy": 5.699162340164184,
|
|
"epoch": 0.7733669397185465,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.000494656009766275,
|
|
"loss": 5.5601,
|
|
"mean_token_accuracy": 0.16677376627922058,
|
|
"num_tokens": 16993179.0,
|
|
"step": 9205
|
|
},
|
|
{
|
|
"entropy": 5.676146554946899,
|
|
"epoch": 0.7737870195337114,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000494649519845597,
|
|
"loss": 5.58,
|
|
"mean_token_accuracy": 0.15755597352981568,
|
|
"num_tokens": 17002563.0,
|
|
"step": 9210
|
|
},
|
|
{
|
|
"entropy": 5.762381267547608,
|
|
"epoch": 0.7742070993488763,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004946430260339409,
|
|
"loss": 5.6151,
|
|
"mean_token_accuracy": 0.15177177786827087,
|
|
"num_tokens": 17011805.0,
|
|
"step": 9215
|
|
},
|
|
{
|
|
"entropy": 5.730970144271851,
|
|
"epoch": 0.7746271791640411,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004946365283314216,
|
|
"loss": 5.5539,
|
|
"mean_token_accuracy": 0.15288405269384384,
|
|
"num_tokens": 17020398.0,
|
|
"step": 9220
|
|
},
|
|
{
|
|
"entropy": 5.67476453781128,
|
|
"epoch": 0.775047258979206,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004946300267381545,
|
|
"loss": 5.5585,
|
|
"mean_token_accuracy": 0.15727789849042892,
|
|
"num_tokens": 17030805.0,
|
|
"step": 9225
|
|
},
|
|
{
|
|
"entropy": 5.746345138549804,
|
|
"epoch": 0.7754673387943709,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004946235212542544,
|
|
"loss": 5.5945,
|
|
"mean_token_accuracy": 0.15904315412044526,
|
|
"num_tokens": 17040164.0,
|
|
"step": 9230
|
|
},
|
|
{
|
|
"entropy": 5.746207189559937,
|
|
"epoch": 0.7758874186095358,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004946170118798367,
|
|
"loss": 5.6473,
|
|
"mean_token_accuracy": 0.15356625765562057,
|
|
"num_tokens": 17049519.0,
|
|
"step": 9235
|
|
},
|
|
{
|
|
"entropy": 5.743830728530884,
|
|
"epoch": 0.7763074984247007,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004946104986150167,
|
|
"loss": 5.5775,
|
|
"mean_token_accuracy": 0.15959766507148743,
|
|
"num_tokens": 17058042.0,
|
|
"step": 9240
|
|
},
|
|
{
|
|
"entropy": 5.7007852554321286,
|
|
"epoch": 0.7767275782398656,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004946039814599099,
|
|
"loss": 5.6015,
|
|
"mean_token_accuracy": 0.1615740664303303,
|
|
"num_tokens": 17067107.0,
|
|
"step": 9245
|
|
},
|
|
{
|
|
"entropy": 5.7166907787323,
|
|
"epoch": 0.7771476580550305,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004945974604146316,
|
|
"loss": 5.6825,
|
|
"mean_token_accuracy": 0.16084015890955924,
|
|
"num_tokens": 17076975.0,
|
|
"step": 9250
|
|
},
|
|
{
|
|
"entropy": 5.709249782562256,
|
|
"epoch": 0.7775677378701953,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004945909354792974,
|
|
"loss": 5.5367,
|
|
"mean_token_accuracy": 0.1583693414926529,
|
|
"num_tokens": 17086405.0,
|
|
"step": 9255
|
|
},
|
|
{
|
|
"entropy": 5.691678667068482,
|
|
"epoch": 0.7779878176853602,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004945844066540229,
|
|
"loss": 5.6114,
|
|
"mean_token_accuracy": 0.15048627853393554,
|
|
"num_tokens": 17095333.0,
|
|
"step": 9260
|
|
},
|
|
{
|
|
"entropy": 5.744761037826538,
|
|
"epoch": 0.7784078975005251,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004945778739389236,
|
|
"loss": 5.6402,
|
|
"mean_token_accuracy": 0.15518757700920105,
|
|
"num_tokens": 17103631.0,
|
|
"step": 9265
|
|
},
|
|
{
|
|
"entropy": 5.740716123580933,
|
|
"epoch": 0.77882797731569,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004945713373341152,
|
|
"loss": 5.5545,
|
|
"mean_token_accuracy": 0.15527277439832687,
|
|
"num_tokens": 17112612.0,
|
|
"step": 9270
|
|
},
|
|
{
|
|
"entropy": 5.7405472755432125,
|
|
"epoch": 0.7792480571308549,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004945647968397139,
|
|
"loss": 5.5985,
|
|
"mean_token_accuracy": 0.15742275416851043,
|
|
"num_tokens": 17121592.0,
|
|
"step": 9275
|
|
},
|
|
{
|
|
"entropy": 5.7078413486480715,
|
|
"epoch": 0.7796681369460198,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004945582524558352,
|
|
"loss": 5.6183,
|
|
"mean_token_accuracy": 0.15746289044618605,
|
|
"num_tokens": 17131003.0,
|
|
"step": 9280
|
|
},
|
|
{
|
|
"entropy": 5.772906827926636,
|
|
"epoch": 0.7800882167611847,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000494551704182595,
|
|
"loss": 5.621,
|
|
"mean_token_accuracy": 0.15148474127054215,
|
|
"num_tokens": 17140013.0,
|
|
"step": 9285
|
|
},
|
|
{
|
|
"entropy": 5.876817035675049,
|
|
"epoch": 0.7805082965763495,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004945451520201095,
|
|
"loss": 5.7745,
|
|
"mean_token_accuracy": 0.144088314473629,
|
|
"num_tokens": 17150406.0,
|
|
"step": 9290
|
|
},
|
|
{
|
|
"entropy": 5.749739122390747,
|
|
"epoch": 0.7809283763915144,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004945385959684947,
|
|
"loss": 5.6295,
|
|
"mean_token_accuracy": 0.15528156161308287,
|
|
"num_tokens": 17159757.0,
|
|
"step": 9295
|
|
},
|
|
{
|
|
"entropy": 5.772205591201782,
|
|
"epoch": 0.7813484562066793,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004945320360278667,
|
|
"loss": 5.6364,
|
|
"mean_token_accuracy": 0.16099981665611268,
|
|
"num_tokens": 17169317.0,
|
|
"step": 9300
|
|
},
|
|
{
|
|
"entropy": 5.811273384094238,
|
|
"epoch": 0.7817685360218442,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004945254721983416,
|
|
"loss": 5.6388,
|
|
"mean_token_accuracy": 0.16424707993865012,
|
|
"num_tokens": 17178410.0,
|
|
"step": 9305
|
|
},
|
|
{
|
|
"entropy": 5.776491260528564,
|
|
"epoch": 0.782188615837009,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.000494518904480036,
|
|
"loss": 5.5526,
|
|
"mean_token_accuracy": 0.15707804411649703,
|
|
"num_tokens": 17186922.0,
|
|
"step": 9310
|
|
},
|
|
{
|
|
"entropy": 5.737505054473877,
|
|
"epoch": 0.782608695652174,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004945123328730659,
|
|
"loss": 5.651,
|
|
"mean_token_accuracy": 0.15191663503646852,
|
|
"num_tokens": 17197125.0,
|
|
"step": 9315
|
|
},
|
|
{
|
|
"entropy": 5.706150245666504,
|
|
"epoch": 0.7830287754673388,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.000494505757377548,
|
|
"loss": 5.5702,
|
|
"mean_token_accuracy": 0.15521910637617112,
|
|
"num_tokens": 17206169.0,
|
|
"step": 9320
|
|
},
|
|
{
|
|
"entropy": 5.650849008560181,
|
|
"epoch": 0.7834488552825036,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004944991779935985,
|
|
"loss": 5.5078,
|
|
"mean_token_accuracy": 0.15866184681653978,
|
|
"num_tokens": 17214607.0,
|
|
"step": 9325
|
|
},
|
|
{
|
|
"entropy": 5.662286853790283,
|
|
"epoch": 0.7838689350976685,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000494492594721334,
|
|
"loss": 5.4998,
|
|
"mean_token_accuracy": 0.1558862790465355,
|
|
"num_tokens": 17223616.0,
|
|
"step": 9330
|
|
},
|
|
{
|
|
"entropy": 5.785139083862305,
|
|
"epoch": 0.7842890149128334,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004944860075608715,
|
|
"loss": 5.591,
|
|
"mean_token_accuracy": 0.15421077311038972,
|
|
"num_tokens": 17232729.0,
|
|
"step": 9335
|
|
},
|
|
{
|
|
"entropy": 5.724941205978394,
|
|
"epoch": 0.7847090947279983,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004944794165123272,
|
|
"loss": 5.6489,
|
|
"mean_token_accuracy": 0.15867509245872496,
|
|
"num_tokens": 17242128.0,
|
|
"step": 9340
|
|
},
|
|
{
|
|
"entropy": 5.691767406463623,
|
|
"epoch": 0.7851291745431632,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.000494472821575818,
|
|
"loss": 5.538,
|
|
"mean_token_accuracy": 0.15931878685951234,
|
|
"num_tokens": 17250806.0,
|
|
"step": 9345
|
|
},
|
|
{
|
|
"entropy": 5.807684230804443,
|
|
"epoch": 0.7855492543583281,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004944662227514609,
|
|
"loss": 5.7718,
|
|
"mean_token_accuracy": 0.14448002949357033,
|
|
"num_tokens": 17260888.0,
|
|
"step": 9350
|
|
},
|
|
{
|
|
"entropy": 5.739693880081177,
|
|
"epoch": 0.785969334173493,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004944596200393726,
|
|
"loss": 5.5242,
|
|
"mean_token_accuracy": 0.16088390797376634,
|
|
"num_tokens": 17270387.0,
|
|
"step": 9355
|
|
},
|
|
{
|
|
"entropy": 5.755869579315186,
|
|
"epoch": 0.7863894139886578,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004944530134396702,
|
|
"loss": 5.5782,
|
|
"mean_token_accuracy": 0.15905830040574073,
|
|
"num_tokens": 17279866.0,
|
|
"step": 9360
|
|
},
|
|
{
|
|
"entropy": 5.681657218933106,
|
|
"epoch": 0.7868094938038227,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004944464029524707,
|
|
"loss": 5.5575,
|
|
"mean_token_accuracy": 0.15929284542798997,
|
|
"num_tokens": 17289233.0,
|
|
"step": 9365
|
|
},
|
|
{
|
|
"entropy": 5.764404249191284,
|
|
"epoch": 0.7872295736189876,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.000494439788577891,
|
|
"loss": 5.6412,
|
|
"mean_token_accuracy": 0.1553253263235092,
|
|
"num_tokens": 17298705.0,
|
|
"step": 9370
|
|
},
|
|
{
|
|
"entropy": 5.760324287414551,
|
|
"epoch": 0.7876496534341525,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004944331703160486,
|
|
"loss": 5.6003,
|
|
"mean_token_accuracy": 0.15822061598300935,
|
|
"num_tokens": 17307793.0,
|
|
"step": 9375
|
|
},
|
|
{
|
|
"entropy": 5.7375633239746096,
|
|
"epoch": 0.7880697332493174,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004944265481670605,
|
|
"loss": 5.6821,
|
|
"mean_token_accuracy": 0.14730169177055358,
|
|
"num_tokens": 17318248.0,
|
|
"step": 9380
|
|
},
|
|
{
|
|
"entropy": 5.723534107208252,
|
|
"epoch": 0.7884898130644823,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004944199221310441,
|
|
"loss": 5.5923,
|
|
"mean_token_accuracy": 0.15378608107566832,
|
|
"num_tokens": 17327281.0,
|
|
"step": 9385
|
|
},
|
|
{
|
|
"entropy": 5.751460123062134,
|
|
"epoch": 0.7889098928796471,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004944132922081168,
|
|
"loss": 5.6041,
|
|
"mean_token_accuracy": 0.16031598746776582,
|
|
"num_tokens": 17336805.0,
|
|
"step": 9390
|
|
},
|
|
{
|
|
"entropy": 5.720217990875244,
|
|
"epoch": 0.789329972694812,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004944066583983961,
|
|
"loss": 5.5563,
|
|
"mean_token_accuracy": 0.15304937809705735,
|
|
"num_tokens": 17346024.0,
|
|
"step": 9395
|
|
},
|
|
{
|
|
"entropy": 5.72339243888855,
|
|
"epoch": 0.7897500525099769,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004944000207019992,
|
|
"loss": 5.638,
|
|
"mean_token_accuracy": 0.15410090163350104,
|
|
"num_tokens": 17355100.0,
|
|
"step": 9400
|
|
},
|
|
{
|
|
"entropy": 5.789851570129395,
|
|
"epoch": 0.7901701323251418,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004943933791190441,
|
|
"loss": 5.6855,
|
|
"mean_token_accuracy": 0.14976395517587662,
|
|
"num_tokens": 17364769.0,
|
|
"step": 9405
|
|
},
|
|
{
|
|
"entropy": 5.733008146286011,
|
|
"epoch": 0.7905902121403067,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004943867336496482,
|
|
"loss": 5.5309,
|
|
"mean_token_accuracy": 0.16207973062992095,
|
|
"num_tokens": 17374082.0,
|
|
"step": 9410
|
|
},
|
|
{
|
|
"entropy": 5.677785396575928,
|
|
"epoch": 0.7910102919554716,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004943800842939293,
|
|
"loss": 5.5769,
|
|
"mean_token_accuracy": 0.15575399696826936,
|
|
"num_tokens": 17383570.0,
|
|
"step": 9415
|
|
},
|
|
{
|
|
"entropy": 5.715662574768066,
|
|
"epoch": 0.7914303717706365,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.000494373431052005,
|
|
"loss": 5.5744,
|
|
"mean_token_accuracy": 0.15315101221203803,
|
|
"num_tokens": 17392105.0,
|
|
"step": 9420
|
|
},
|
|
{
|
|
"entropy": 5.687941646575927,
|
|
"epoch": 0.7918504515858013,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004943667739239935,
|
|
"loss": 5.5394,
|
|
"mean_token_accuracy": 0.16118402928113937,
|
|
"num_tokens": 17401363.0,
|
|
"step": 9425
|
|
},
|
|
{
|
|
"entropy": 5.805943298339844,
|
|
"epoch": 0.7922705314009661,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004943601129100125,
|
|
"loss": 5.5689,
|
|
"mean_token_accuracy": 0.1596407786011696,
|
|
"num_tokens": 17411333.0,
|
|
"step": 9430
|
|
},
|
|
{
|
|
"entropy": 5.742655897140503,
|
|
"epoch": 0.792690611216131,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004943534480101801,
|
|
"loss": 5.6116,
|
|
"mean_token_accuracy": 0.16205159723758697,
|
|
"num_tokens": 17421162.0,
|
|
"step": 9435
|
|
},
|
|
{
|
|
"entropy": 5.675297021865845,
|
|
"epoch": 0.793110691031296,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004943467792246142,
|
|
"loss": 5.5643,
|
|
"mean_token_accuracy": 0.16239771544933318,
|
|
"num_tokens": 17430119.0,
|
|
"step": 9440
|
|
},
|
|
{
|
|
"entropy": 5.73559455871582,
|
|
"epoch": 0.7935307708464608,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004943401065534332,
|
|
"loss": 5.5718,
|
|
"mean_token_accuracy": 0.1576070472598076,
|
|
"num_tokens": 17439617.0,
|
|
"step": 9445
|
|
},
|
|
{
|
|
"entropy": 5.6667726039886475,
|
|
"epoch": 0.7939508506616257,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004943334299967551,
|
|
"loss": 5.6816,
|
|
"mean_token_accuracy": 0.15144198015332222,
|
|
"num_tokens": 17448720.0,
|
|
"step": 9450
|
|
},
|
|
{
|
|
"entropy": 5.692353296279907,
|
|
"epoch": 0.7943709304767906,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004943267495546982,
|
|
"loss": 5.5709,
|
|
"mean_token_accuracy": 0.16103459149599075,
|
|
"num_tokens": 17457458.0,
|
|
"step": 9455
|
|
},
|
|
{
|
|
"entropy": 5.826778411865234,
|
|
"epoch": 0.7947910102919554,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004943200652273809,
|
|
"loss": 5.5988,
|
|
"mean_token_accuracy": 0.1572289600968361,
|
|
"num_tokens": 17467095.0,
|
|
"step": 9460
|
|
},
|
|
{
|
|
"entropy": 5.734735774993896,
|
|
"epoch": 0.7952110901071203,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004943133770149216,
|
|
"loss": 5.6124,
|
|
"mean_token_accuracy": 0.14904361963272095,
|
|
"num_tokens": 17476247.0,
|
|
"step": 9465
|
|
},
|
|
{
|
|
"entropy": 5.758768129348755,
|
|
"epoch": 0.7956311699222852,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004943066849174386,
|
|
"loss": 5.6185,
|
|
"mean_token_accuracy": 0.16090165376663207,
|
|
"num_tokens": 17486352.0,
|
|
"step": 9470
|
|
},
|
|
{
|
|
"entropy": 5.764822721481323,
|
|
"epoch": 0.7960512497374501,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004942999889350508,
|
|
"loss": 5.6045,
|
|
"mean_token_accuracy": 0.1577667087316513,
|
|
"num_tokens": 17495633.0,
|
|
"step": 9475
|
|
},
|
|
{
|
|
"entropy": 5.76590781211853,
|
|
"epoch": 0.796471329552615,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004942932890678765,
|
|
"loss": 5.6398,
|
|
"mean_token_accuracy": 0.1567259430885315,
|
|
"num_tokens": 17504325.0,
|
|
"step": 9480
|
|
},
|
|
{
|
|
"entropy": 5.784614133834839,
|
|
"epoch": 0.7968914093677799,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004942865853160346,
|
|
"loss": 5.6395,
|
|
"mean_token_accuracy": 0.15519623905420304,
|
|
"num_tokens": 17513265.0,
|
|
"step": 9485
|
|
},
|
|
{
|
|
"entropy": 5.779874706268311,
|
|
"epoch": 0.7973114891829448,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004942798776796436,
|
|
"loss": 5.6635,
|
|
"mean_token_accuracy": 0.1501373194158077,
|
|
"num_tokens": 17522939.0,
|
|
"step": 9490
|
|
},
|
|
{
|
|
"entropy": 5.813601112365722,
|
|
"epoch": 0.7977315689981096,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004942731661588226,
|
|
"loss": 5.6776,
|
|
"mean_token_accuracy": 0.14708788096904754,
|
|
"num_tokens": 17532250.0,
|
|
"step": 9495
|
|
},
|
|
{
|
|
"entropy": 5.8093610286712645,
|
|
"epoch": 0.7981516488132745,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004942664507536904,
|
|
"loss": 5.6768,
|
|
"mean_token_accuracy": 0.15758057236671447,
|
|
"num_tokens": 17541368.0,
|
|
"step": 9500
|
|
},
|
|
{
|
|
"entropy": 5.709500455856324,
|
|
"epoch": 0.7985717286284394,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004942597314643659,
|
|
"loss": 5.6119,
|
|
"mean_token_accuracy": 0.15914746522903442,
|
|
"num_tokens": 17550871.0,
|
|
"step": 9505
|
|
},
|
|
{
|
|
"entropy": 5.7264715194702145,
|
|
"epoch": 0.7989918084436043,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004942530082909681,
|
|
"loss": 5.5592,
|
|
"mean_token_accuracy": 0.16579595506191253,
|
|
"num_tokens": 17559683.0,
|
|
"step": 9510
|
|
},
|
|
{
|
|
"entropy": 5.739404821395874,
|
|
"epoch": 0.7994118882587692,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004942462812336163,
|
|
"loss": 5.5622,
|
|
"mean_token_accuracy": 0.15668560117483138,
|
|
"num_tokens": 17568877.0,
|
|
"step": 9515
|
|
},
|
|
{
|
|
"entropy": 5.809852504730225,
|
|
"epoch": 0.7998319680739341,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004942395502924293,
|
|
"loss": 5.6985,
|
|
"mean_token_accuracy": 0.14932294711470603,
|
|
"num_tokens": 17578202.0,
|
|
"step": 9520
|
|
},
|
|
{
|
|
"entropy": 5.7818114280700685,
|
|
"epoch": 0.800252047889099,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004942328154675268,
|
|
"loss": 5.5476,
|
|
"mean_token_accuracy": 0.1580620989203453,
|
|
"num_tokens": 17587342.0,
|
|
"step": 9525
|
|
},
|
|
{
|
|
"entropy": 5.704915952682495,
|
|
"epoch": 0.8006721277042638,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004942260767590277,
|
|
"loss": 5.4071,
|
|
"mean_token_accuracy": 0.16135389506816863,
|
|
"num_tokens": 17595671.0,
|
|
"step": 9530
|
|
},
|
|
{
|
|
"entropy": 5.718443107604981,
|
|
"epoch": 0.8010922075194287,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004942193341670516,
|
|
"loss": 5.7412,
|
|
"mean_token_accuracy": 0.14880940914154053,
|
|
"num_tokens": 17605649.0,
|
|
"step": 9535
|
|
},
|
|
{
|
|
"entropy": 5.7878273010253904,
|
|
"epoch": 0.8015122873345936,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004942125876917178,
|
|
"loss": 5.6307,
|
|
"mean_token_accuracy": 0.15066928714513778,
|
|
"num_tokens": 17615286.0,
|
|
"step": 9540
|
|
},
|
|
{
|
|
"entropy": 5.6786088943481445,
|
|
"epoch": 0.8019323671497585,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000494205837333146,
|
|
"loss": 5.5965,
|
|
"mean_token_accuracy": 0.15368861109018325,
|
|
"num_tokens": 17624583.0,
|
|
"step": 9545
|
|
},
|
|
{
|
|
"entropy": 5.73230299949646,
|
|
"epoch": 0.8023524469649234,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004941990830914557,
|
|
"loss": 5.5956,
|
|
"mean_token_accuracy": 0.15637702941894532,
|
|
"num_tokens": 17633894.0,
|
|
"step": 9550
|
|
},
|
|
{
|
|
"entropy": 5.777722024917603,
|
|
"epoch": 0.8027725267800883,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004941923249667663,
|
|
"loss": 5.6906,
|
|
"mean_token_accuracy": 0.1498961329460144,
|
|
"num_tokens": 17643172.0,
|
|
"step": 9555
|
|
},
|
|
{
|
|
"entropy": 5.776974630355835,
|
|
"epoch": 0.803192606595253,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004941855629591979,
|
|
"loss": 5.564,
|
|
"mean_token_accuracy": 0.15488137155771256,
|
|
"num_tokens": 17651901.0,
|
|
"step": 9560
|
|
},
|
|
{
|
|
"entropy": 5.695595645904541,
|
|
"epoch": 0.8036126864104179,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004941787970688701,
|
|
"loss": 5.5599,
|
|
"mean_token_accuracy": 0.1548061341047287,
|
|
"num_tokens": 17660806.0,
|
|
"step": 9565
|
|
},
|
|
{
|
|
"entropy": 5.745298910140991,
|
|
"epoch": 0.8040327662255828,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004941720272959027,
|
|
"loss": 5.6197,
|
|
"mean_token_accuracy": 0.16593484580516815,
|
|
"num_tokens": 17669157.0,
|
|
"step": 9570
|
|
},
|
|
{
|
|
"entropy": 5.67219820022583,
|
|
"epoch": 0.8044528460407477,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004941652536404157,
|
|
"loss": 5.5339,
|
|
"mean_token_accuracy": 0.15697258710861206,
|
|
"num_tokens": 17678664.0,
|
|
"step": 9575
|
|
},
|
|
{
|
|
"entropy": 5.7390752792358395,
|
|
"epoch": 0.8048729258559126,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004941584761025291,
|
|
"loss": 5.5754,
|
|
"mean_token_accuracy": 0.15572021156549454,
|
|
"num_tokens": 17688252.0,
|
|
"step": 9580
|
|
},
|
|
{
|
|
"entropy": 5.716245555877686,
|
|
"epoch": 0.8052930056710775,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.000494151694682363,
|
|
"loss": 5.6001,
|
|
"mean_token_accuracy": 0.15716407224535942,
|
|
"num_tokens": 17696473.0,
|
|
"step": 9585
|
|
},
|
|
{
|
|
"entropy": 5.7132292747497555,
|
|
"epoch": 0.8057130854862424,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004941449093800374,
|
|
"loss": 5.6374,
|
|
"mean_token_accuracy": 0.15638218745589255,
|
|
"num_tokens": 17706177.0,
|
|
"step": 9590
|
|
},
|
|
{
|
|
"entropy": 5.664974069595337,
|
|
"epoch": 0.8061331653014072,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004941381201956726,
|
|
"loss": 5.4683,
|
|
"mean_token_accuracy": 0.16399573683738708,
|
|
"num_tokens": 17715355.0,
|
|
"step": 9595
|
|
},
|
|
{
|
|
"entropy": 5.719002342224121,
|
|
"epoch": 0.8065532451165721,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004941313271293889,
|
|
"loss": 5.5711,
|
|
"mean_token_accuracy": 0.16259212642908097,
|
|
"num_tokens": 17724345.0,
|
|
"step": 9600
|
|
},
|
|
{
|
|
"entropy": 5.7297220706939695,
|
|
"epoch": 0.806973324931737,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004941245301813065,
|
|
"loss": 5.4994,
|
|
"mean_token_accuracy": 0.16674552410840987,
|
|
"num_tokens": 17732805.0,
|
|
"step": 9605
|
|
},
|
|
{
|
|
"entropy": 5.628268814086914,
|
|
"epoch": 0.8073934047469019,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004941177293515459,
|
|
"loss": 5.5412,
|
|
"mean_token_accuracy": 0.16236957609653474,
|
|
"num_tokens": 17741963.0,
|
|
"step": 9610
|
|
},
|
|
{
|
|
"entropy": 5.633747529983521,
|
|
"epoch": 0.8078134845620668,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004941109246402275,
|
|
"loss": 5.5727,
|
|
"mean_token_accuracy": 0.1555377036333084,
|
|
"num_tokens": 17751858.0,
|
|
"step": 9615
|
|
},
|
|
{
|
|
"entropy": 5.845067882537842,
|
|
"epoch": 0.8082335643772317,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004941041160474721,
|
|
"loss": 5.6876,
|
|
"mean_token_accuracy": 0.15293847322463988,
|
|
"num_tokens": 17761152.0,
|
|
"step": 9620
|
|
},
|
|
{
|
|
"entropy": 5.818953990936279,
|
|
"epoch": 0.8086536441923966,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004940973035733999,
|
|
"loss": 5.6166,
|
|
"mean_token_accuracy": 0.15496550351381302,
|
|
"num_tokens": 17770493.0,
|
|
"step": 9625
|
|
},
|
|
{
|
|
"entropy": 5.828579998016357,
|
|
"epoch": 0.8090737240075614,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004940904872181318,
|
|
"loss": 5.6185,
|
|
"mean_token_accuracy": 0.15005985349416734,
|
|
"num_tokens": 17779871.0,
|
|
"step": 9630
|
|
},
|
|
{
|
|
"entropy": 5.764964485168457,
|
|
"epoch": 0.8094938038227263,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004940836669817887,
|
|
"loss": 5.6368,
|
|
"mean_token_accuracy": 0.14861269071698188,
|
|
"num_tokens": 17788606.0,
|
|
"step": 9635
|
|
},
|
|
{
|
|
"entropy": 5.634247922897339,
|
|
"epoch": 0.8099138836378912,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004940768428644911,
|
|
"loss": 5.5661,
|
|
"mean_token_accuracy": 0.1579072043299675,
|
|
"num_tokens": 17797458.0,
|
|
"step": 9640
|
|
},
|
|
{
|
|
"entropy": 5.667587184906006,
|
|
"epoch": 0.8103339634530561,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004940700148663601,
|
|
"loss": 5.5228,
|
|
"mean_token_accuracy": 0.15832778215408325,
|
|
"num_tokens": 17806902.0,
|
|
"step": 9645
|
|
},
|
|
{
|
|
"entropy": 5.784042167663574,
|
|
"epoch": 0.810754043268221,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004940631829875165,
|
|
"loss": 5.6538,
|
|
"mean_token_accuracy": 0.15309607237577438,
|
|
"num_tokens": 17816374.0,
|
|
"step": 9650
|
|
},
|
|
{
|
|
"entropy": 5.747311544418335,
|
|
"epoch": 0.8111741230833859,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004940563472280815,
|
|
"loss": 5.6394,
|
|
"mean_token_accuracy": 0.15840844437479973,
|
|
"num_tokens": 17825267.0,
|
|
"step": 9655
|
|
},
|
|
{
|
|
"entropy": 5.6849054336547855,
|
|
"epoch": 0.8115942028985508,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004940495075881761,
|
|
"loss": 5.5501,
|
|
"mean_token_accuracy": 0.15928401798009872,
|
|
"num_tokens": 17834027.0,
|
|
"step": 9660
|
|
},
|
|
{
|
|
"entropy": 5.710701990127563,
|
|
"epoch": 0.8120142827137156,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004940426640679214,
|
|
"loss": 5.5574,
|
|
"mean_token_accuracy": 0.15307731851935386,
|
|
"num_tokens": 17843587.0,
|
|
"step": 9665
|
|
},
|
|
{
|
|
"entropy": 5.759457540512085,
|
|
"epoch": 0.8124343625288805,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004940358166674388,
|
|
"loss": 5.5651,
|
|
"mean_token_accuracy": 0.1602780595421791,
|
|
"num_tokens": 17852284.0,
|
|
"step": 9670
|
|
},
|
|
{
|
|
"entropy": 5.7291789054870605,
|
|
"epoch": 0.8128544423440454,
|
|
"grad_norm": 2.96875,
|
|
"learning_rate": 0.0004940289653868494,
|
|
"loss": 5.5959,
|
|
"mean_token_accuracy": 0.1548108696937561,
|
|
"num_tokens": 17860896.0,
|
|
"step": 9675
|
|
},
|
|
{
|
|
"entropy": 5.592970943450927,
|
|
"epoch": 0.8132745221592103,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004940221102262747,
|
|
"loss": 5.5689,
|
|
"mean_token_accuracy": 0.15153344422578813,
|
|
"num_tokens": 17870796.0,
|
|
"step": 9680
|
|
},
|
|
{
|
|
"entropy": 5.760674333572387,
|
|
"epoch": 0.8136946019743752,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004940152511858361,
|
|
"loss": 5.6389,
|
|
"mean_token_accuracy": 0.15776861757040023,
|
|
"num_tokens": 17880016.0,
|
|
"step": 9685
|
|
},
|
|
{
|
|
"entropy": 5.840538072586059,
|
|
"epoch": 0.81411468178954,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004940083882656551,
|
|
"loss": 5.6842,
|
|
"mean_token_accuracy": 0.1527305245399475,
|
|
"num_tokens": 17889348.0,
|
|
"step": 9690
|
|
},
|
|
{
|
|
"entropy": 5.7093421459198,
|
|
"epoch": 0.814534761604705,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004940015214658532,
|
|
"loss": 5.5527,
|
|
"mean_token_accuracy": 0.16001000851392747,
|
|
"num_tokens": 17898392.0,
|
|
"step": 9695
|
|
},
|
|
{
|
|
"entropy": 5.737645149230957,
|
|
"epoch": 0.8149548414198697,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004939946507865522,
|
|
"loss": 5.644,
|
|
"mean_token_accuracy": 0.16057325303554534,
|
|
"num_tokens": 17907141.0,
|
|
"step": 9700
|
|
},
|
|
{
|
|
"entropy": 5.655606842041015,
|
|
"epoch": 0.8153749212350346,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004939877762278737,
|
|
"loss": 5.5158,
|
|
"mean_token_accuracy": 0.16194361746311187,
|
|
"num_tokens": 17915792.0,
|
|
"step": 9705
|
|
},
|
|
{
|
|
"entropy": 5.797022104263306,
|
|
"epoch": 0.8157950010501995,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004939808977899396,
|
|
"loss": 5.6857,
|
|
"mean_token_accuracy": 0.14906044080853462,
|
|
"num_tokens": 17925603.0,
|
|
"step": 9710
|
|
},
|
|
{
|
|
"entropy": 5.802898216247558,
|
|
"epoch": 0.8162150808653644,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004939740154728716,
|
|
"loss": 5.6278,
|
|
"mean_token_accuracy": 0.15891481786966324,
|
|
"num_tokens": 17934436.0,
|
|
"step": 9715
|
|
},
|
|
{
|
|
"entropy": 5.697312355041504,
|
|
"epoch": 0.8166351606805293,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004939671292767915,
|
|
"loss": 5.5631,
|
|
"mean_token_accuracy": 0.16518739312887193,
|
|
"num_tokens": 17942969.0,
|
|
"step": 9720
|
|
},
|
|
{
|
|
"entropy": 5.805132532119751,
|
|
"epoch": 0.8170552404956942,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004939602392018216,
|
|
"loss": 5.6504,
|
|
"mean_token_accuracy": 0.15329954028129578,
|
|
"num_tokens": 17952053.0,
|
|
"step": 9725
|
|
},
|
|
{
|
|
"entropy": 5.761464977264405,
|
|
"epoch": 0.817475320310859,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004939533452480839,
|
|
"loss": 5.6279,
|
|
"mean_token_accuracy": 0.15541652143001555,
|
|
"num_tokens": 17960707.0,
|
|
"step": 9730
|
|
},
|
|
{
|
|
"entropy": 5.756668901443481,
|
|
"epoch": 0.8178954001260239,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004939464474157003,
|
|
"loss": 5.7156,
|
|
"mean_token_accuracy": 0.1402080774307251,
|
|
"num_tokens": 17971035.0,
|
|
"step": 9735
|
|
},
|
|
{
|
|
"entropy": 5.764472818374633,
|
|
"epoch": 0.8183154799411888,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004939395457047932,
|
|
"loss": 5.5814,
|
|
"mean_token_accuracy": 0.15014344453811646,
|
|
"num_tokens": 17980656.0,
|
|
"step": 9740
|
|
},
|
|
{
|
|
"entropy": 5.8357971668243405,
|
|
"epoch": 0.8187355597563537,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004939326401154847,
|
|
"loss": 5.6189,
|
|
"mean_token_accuracy": 0.15061766505241395,
|
|
"num_tokens": 17990977.0,
|
|
"step": 9745
|
|
},
|
|
{
|
|
"entropy": 5.659277296066284,
|
|
"epoch": 0.8191556395715186,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004939257306478973,
|
|
"loss": 5.6026,
|
|
"mean_token_accuracy": 0.15962347537279128,
|
|
"num_tokens": 18000186.0,
|
|
"step": 9750
|
|
},
|
|
{
|
|
"entropy": 5.66544737815857,
|
|
"epoch": 0.8195757193866835,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004939188173021532,
|
|
"loss": 5.584,
|
|
"mean_token_accuracy": 0.15631651431322097,
|
|
"num_tokens": 18010269.0,
|
|
"step": 9755
|
|
},
|
|
{
|
|
"entropy": 5.734535169601441,
|
|
"epoch": 0.8199957992018484,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004939119000783751,
|
|
"loss": 5.5143,
|
|
"mean_token_accuracy": 0.1653580456972122,
|
|
"num_tokens": 18018461.0,
|
|
"step": 9760
|
|
},
|
|
{
|
|
"entropy": 5.650555562973023,
|
|
"epoch": 0.8204158790170132,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004939049789766855,
|
|
"loss": 5.5355,
|
|
"mean_token_accuracy": 0.15736581087112428,
|
|
"num_tokens": 18027173.0,
|
|
"step": 9765
|
|
},
|
|
{
|
|
"entropy": 5.6493651390075685,
|
|
"epoch": 0.8208359588321781,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004938980539972068,
|
|
"loss": 5.6419,
|
|
"mean_token_accuracy": 0.1530544400215149,
|
|
"num_tokens": 18036791.0,
|
|
"step": 9770
|
|
},
|
|
{
|
|
"entropy": 5.711198854446411,
|
|
"epoch": 0.821256038647343,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004938911251400617,
|
|
"loss": 5.5821,
|
|
"mean_token_accuracy": 0.16347371712327002,
|
|
"num_tokens": 18046908.0,
|
|
"step": 9775
|
|
},
|
|
{
|
|
"entropy": 5.675604200363159,
|
|
"epoch": 0.8216761184625079,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004938841924053731,
|
|
"loss": 5.4868,
|
|
"mean_token_accuracy": 0.16802890747785568,
|
|
"num_tokens": 18055825.0,
|
|
"step": 9780
|
|
},
|
|
{
|
|
"entropy": 5.74740252494812,
|
|
"epoch": 0.8220961982776728,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004938772557932637,
|
|
"loss": 5.6547,
|
|
"mean_token_accuracy": 0.14992306679487227,
|
|
"num_tokens": 18065334.0,
|
|
"step": 9785
|
|
},
|
|
{
|
|
"entropy": 5.804665422439575,
|
|
"epoch": 0.8225162780928377,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004938703153038565,
|
|
"loss": 5.5424,
|
|
"mean_token_accuracy": 0.16053025126457215,
|
|
"num_tokens": 18073999.0,
|
|
"step": 9790
|
|
},
|
|
{
|
|
"entropy": 5.638562202453613,
|
|
"epoch": 0.8229363579080026,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004938633709372744,
|
|
"loss": 5.5667,
|
|
"mean_token_accuracy": 0.1526872143149376,
|
|
"num_tokens": 18083665.0,
|
|
"step": 9795
|
|
},
|
|
{
|
|
"entropy": 5.719447278976441,
|
|
"epoch": 0.8233564377231674,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004938564226936403,
|
|
"loss": 5.5635,
|
|
"mean_token_accuracy": 0.1585657551884651,
|
|
"num_tokens": 18092501.0,
|
|
"step": 9800
|
|
},
|
|
{
|
|
"entropy": 5.707524585723877,
|
|
"epoch": 0.8237765175383323,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004938494705730773,
|
|
"loss": 5.5628,
|
|
"mean_token_accuracy": 0.15374434292316436,
|
|
"num_tokens": 18101320.0,
|
|
"step": 9805
|
|
},
|
|
{
|
|
"entropy": 5.691710615158081,
|
|
"epoch": 0.8241965973534972,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004938425145757087,
|
|
"loss": 5.5892,
|
|
"mean_token_accuracy": 0.15482329502701758,
|
|
"num_tokens": 18110190.0,
|
|
"step": 9810
|
|
},
|
|
{
|
|
"entropy": 5.7161225318908695,
|
|
"epoch": 0.824616677168662,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004938355547016577,
|
|
"loss": 5.578,
|
|
"mean_token_accuracy": 0.15880000442266465,
|
|
"num_tokens": 18119301.0,
|
|
"step": 9815
|
|
},
|
|
{
|
|
"entropy": 5.83074254989624,
|
|
"epoch": 0.825036756983827,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004938285909510474,
|
|
"loss": 5.6304,
|
|
"mean_token_accuracy": 0.15105096995830536,
|
|
"num_tokens": 18128959.0,
|
|
"step": 9820
|
|
},
|
|
{
|
|
"entropy": 5.730278205871582,
|
|
"epoch": 0.8254568367989918,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004938216233240014,
|
|
"loss": 5.5971,
|
|
"mean_token_accuracy": 0.15822493731975557,
|
|
"num_tokens": 18138156.0,
|
|
"step": 9825
|
|
},
|
|
{
|
|
"entropy": 5.770270156860351,
|
|
"epoch": 0.8258769166141567,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000493814651820643,
|
|
"loss": 5.6304,
|
|
"mean_token_accuracy": 0.14718272238969804,
|
|
"num_tokens": 18147244.0,
|
|
"step": 9830
|
|
},
|
|
{
|
|
"entropy": 5.809439325332642,
|
|
"epoch": 0.8262969964293215,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004938076764410956,
|
|
"loss": 5.65,
|
|
"mean_token_accuracy": 0.15126846134662628,
|
|
"num_tokens": 18156040.0,
|
|
"step": 9835
|
|
},
|
|
{
|
|
"entropy": 5.774372339248657,
|
|
"epoch": 0.8267170762444864,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.000493800697185483,
|
|
"loss": 5.5587,
|
|
"mean_token_accuracy": 0.15274341106414796,
|
|
"num_tokens": 18165210.0,
|
|
"step": 9840
|
|
},
|
|
{
|
|
"entropy": 5.755823945999145,
|
|
"epoch": 0.8271371560596513,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004937937140539288,
|
|
"loss": 5.6189,
|
|
"mean_token_accuracy": 0.15705254673957825,
|
|
"num_tokens": 18174841.0,
|
|
"step": 9845
|
|
},
|
|
{
|
|
"entropy": 5.673149156570434,
|
|
"epoch": 0.8275572358748162,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004937867270465564,
|
|
"loss": 5.5197,
|
|
"mean_token_accuracy": 0.15668955445289612,
|
|
"num_tokens": 18184112.0,
|
|
"step": 9850
|
|
},
|
|
{
|
|
"entropy": 5.792433404922486,
|
|
"epoch": 0.8279773156899811,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004937797361634899,
|
|
"loss": 5.7046,
|
|
"mean_token_accuracy": 0.15178792774677277,
|
|
"num_tokens": 18193564.0,
|
|
"step": 9855
|
|
},
|
|
{
|
|
"entropy": 5.670192384719849,
|
|
"epoch": 0.828397395505146,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.000493772741404853,
|
|
"loss": 5.4317,
|
|
"mean_token_accuracy": 0.16718422323465348,
|
|
"num_tokens": 18202836.0,
|
|
"step": 9860
|
|
},
|
|
{
|
|
"entropy": 5.7452069282531735,
|
|
"epoch": 0.8288174753203108,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004937657427707698,
|
|
"loss": 5.5625,
|
|
"mean_token_accuracy": 0.1660207688808441,
|
|
"num_tokens": 18212098.0,
|
|
"step": 9865
|
|
},
|
|
{
|
|
"entropy": 5.751721334457398,
|
|
"epoch": 0.8292375551354757,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004937587402613639,
|
|
"loss": 5.5846,
|
|
"mean_token_accuracy": 0.1579491063952446,
|
|
"num_tokens": 18221541.0,
|
|
"step": 9870
|
|
},
|
|
{
|
|
"entropy": 5.679269027709961,
|
|
"epoch": 0.8296576349506406,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004937517338767597,
|
|
"loss": 5.5911,
|
|
"mean_token_accuracy": 0.15127961188554764,
|
|
"num_tokens": 18231015.0,
|
|
"step": 9875
|
|
},
|
|
{
|
|
"entropy": 5.7905172348022464,
|
|
"epoch": 0.8300777147658055,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004937447236170811,
|
|
"loss": 5.5991,
|
|
"mean_token_accuracy": 0.15796486884355546,
|
|
"num_tokens": 18239729.0,
|
|
"step": 9880
|
|
},
|
|
{
|
|
"entropy": 5.79022216796875,
|
|
"epoch": 0.8304977945809704,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004937377094824523,
|
|
"loss": 5.6634,
|
|
"mean_token_accuracy": 0.15445734858512877,
|
|
"num_tokens": 18249773.0,
|
|
"step": 9885
|
|
},
|
|
{
|
|
"entropy": 5.790219640731811,
|
|
"epoch": 0.8309178743961353,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004937306914729977,
|
|
"loss": 5.6113,
|
|
"mean_token_accuracy": 0.15426182597875596,
|
|
"num_tokens": 18259179.0,
|
|
"step": 9890
|
|
},
|
|
{
|
|
"entropy": 5.654812574386597,
|
|
"epoch": 0.8313379542113002,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004937236695888416,
|
|
"loss": 5.4941,
|
|
"mean_token_accuracy": 0.16756488233804703,
|
|
"num_tokens": 18268164.0,
|
|
"step": 9895
|
|
},
|
|
{
|
|
"entropy": 5.7460182189941404,
|
|
"epoch": 0.831758034026465,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004937166438301082,
|
|
"loss": 5.6649,
|
|
"mean_token_accuracy": 0.15427334159612655,
|
|
"num_tokens": 18276259.0,
|
|
"step": 9900
|
|
},
|
|
{
|
|
"entropy": 5.753793668746948,
|
|
"epoch": 0.8321781138416299,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004937096141969221,
|
|
"loss": 5.6431,
|
|
"mean_token_accuracy": 0.15369066298007966,
|
|
"num_tokens": 18285729.0,
|
|
"step": 9905
|
|
},
|
|
{
|
|
"entropy": 5.808768320083618,
|
|
"epoch": 0.8325981936567948,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004937025806894077,
|
|
"loss": 5.7907,
|
|
"mean_token_accuracy": 0.1430658794939518,
|
|
"num_tokens": 18295873.0,
|
|
"step": 9910
|
|
},
|
|
{
|
|
"entropy": 5.81347017288208,
|
|
"epoch": 0.8330182734719597,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004936955433076899,
|
|
"loss": 5.6261,
|
|
"mean_token_accuracy": 0.16088502556085588,
|
|
"num_tokens": 18305135.0,
|
|
"step": 9915
|
|
},
|
|
{
|
|
"entropy": 5.805698680877685,
|
|
"epoch": 0.8334383532871246,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.000493688502051893,
|
|
"loss": 5.6659,
|
|
"mean_token_accuracy": 0.15734423473477363,
|
|
"num_tokens": 18314251.0,
|
|
"step": 9920
|
|
},
|
|
{
|
|
"entropy": 5.717606830596924,
|
|
"epoch": 0.8338584331022895,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004936814569221421,
|
|
"loss": 5.5011,
|
|
"mean_token_accuracy": 0.1689703121781349,
|
|
"num_tokens": 18322863.0,
|
|
"step": 9925
|
|
},
|
|
{
|
|
"entropy": 5.666426181793213,
|
|
"epoch": 0.8342785129174544,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004936744079185616,
|
|
"loss": 5.5171,
|
|
"mean_token_accuracy": 0.1564257636666298,
|
|
"num_tokens": 18332129.0,
|
|
"step": 9930
|
|
},
|
|
{
|
|
"entropy": 5.784151458740235,
|
|
"epoch": 0.8346985927326191,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004936673550412767,
|
|
"loss": 5.6047,
|
|
"mean_token_accuracy": 0.1550414428114891,
|
|
"num_tokens": 18341457.0,
|
|
"step": 9935
|
|
},
|
|
{
|
|
"entropy": 5.771088027954102,
|
|
"epoch": 0.835118672547784,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000493660298290412,
|
|
"loss": 5.5876,
|
|
"mean_token_accuracy": 0.1493722081184387,
|
|
"num_tokens": 18351397.0,
|
|
"step": 9940
|
|
},
|
|
{
|
|
"entropy": 5.720294952392578,
|
|
"epoch": 0.8355387523629489,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004936532376660929,
|
|
"loss": 5.5374,
|
|
"mean_token_accuracy": 0.1618410602211952,
|
|
"num_tokens": 18360005.0,
|
|
"step": 9945
|
|
},
|
|
{
|
|
"entropy": 5.776373720169067,
|
|
"epoch": 0.8359588321781138,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004936461731684442,
|
|
"loss": 5.6228,
|
|
"mean_token_accuracy": 0.15494077503681183,
|
|
"num_tokens": 18369707.0,
|
|
"step": 9950
|
|
},
|
|
{
|
|
"entropy": 5.7964976787567135,
|
|
"epoch": 0.8363789119932787,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004936391047975912,
|
|
"loss": 5.7554,
|
|
"mean_token_accuracy": 0.15112185776233672,
|
|
"num_tokens": 18379514.0,
|
|
"step": 9955
|
|
},
|
|
{
|
|
"entropy": 5.773888969421387,
|
|
"epoch": 0.8367989918084436,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004936320325536589,
|
|
"loss": 5.4665,
|
|
"mean_token_accuracy": 0.16086758822202682,
|
|
"num_tokens": 18388854.0,
|
|
"step": 9960
|
|
},
|
|
{
|
|
"entropy": 5.768847894668579,
|
|
"epoch": 0.8372190716236085,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004936249564367729,
|
|
"loss": 5.6474,
|
|
"mean_token_accuracy": 0.15842091292142868,
|
|
"num_tokens": 18397806.0,
|
|
"step": 9965
|
|
},
|
|
{
|
|
"entropy": 5.585431623458862,
|
|
"epoch": 0.8376391514387733,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004936178764470583,
|
|
"loss": 5.4896,
|
|
"mean_token_accuracy": 0.1574113130569458,
|
|
"num_tokens": 18406645.0,
|
|
"step": 9970
|
|
},
|
|
{
|
|
"entropy": 5.645328092575073,
|
|
"epoch": 0.8380592312539382,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004936107925846405,
|
|
"loss": 5.5151,
|
|
"mean_token_accuracy": 0.1597437858581543,
|
|
"num_tokens": 18415730.0,
|
|
"step": 9975
|
|
},
|
|
{
|
|
"entropy": 5.784190368652344,
|
|
"epoch": 0.8384793110691031,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004936037048496452,
|
|
"loss": 5.6177,
|
|
"mean_token_accuracy": 0.1599542900919914,
|
|
"num_tokens": 18424638.0,
|
|
"step": 9980
|
|
},
|
|
{
|
|
"entropy": 5.734972286224365,
|
|
"epoch": 0.838899390884268,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004935966132421977,
|
|
"loss": 5.6581,
|
|
"mean_token_accuracy": 0.1531389981508255,
|
|
"num_tokens": 18434090.0,
|
|
"step": 9985
|
|
},
|
|
{
|
|
"entropy": 5.6199877738952635,
|
|
"epoch": 0.8393194706994329,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004935895177624239,
|
|
"loss": 5.5032,
|
|
"mean_token_accuracy": 0.16159379929304124,
|
|
"num_tokens": 18442965.0,
|
|
"step": 9990
|
|
},
|
|
{
|
|
"entropy": 5.781462097167969,
|
|
"epoch": 0.8397395505145978,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004935824184104493,
|
|
"loss": 5.5283,
|
|
"mean_token_accuracy": 0.16171049624681472,
|
|
"num_tokens": 18451553.0,
|
|
"step": 9995
|
|
},
|
|
{
|
|
"entropy": 5.757485055923462,
|
|
"epoch": 0.8401596303297627,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004935753151863997,
|
|
"loss": 5.5918,
|
|
"mean_token_accuracy": 0.1540190264582634,
|
|
"num_tokens": 18461325.0,
|
|
"step": 10000
|
|
},
|
|
{
|
|
"entropy": 5.7339729309082035,
|
|
"epoch": 0.8405797101449275,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004935682080904009,
|
|
"loss": 5.5942,
|
|
"mean_token_accuracy": 0.16424052268266678,
|
|
"num_tokens": 18469977.0,
|
|
"step": 10005
|
|
},
|
|
{
|
|
"entropy": 5.734982681274414,
|
|
"epoch": 0.8409997899600924,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004935610971225789,
|
|
"loss": 5.5382,
|
|
"mean_token_accuracy": 0.15915794372558595,
|
|
"num_tokens": 18479534.0,
|
|
"step": 10010
|
|
},
|
|
{
|
|
"entropy": 5.69389705657959,
|
|
"epoch": 0.8414198697752573,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004935539822830597,
|
|
"loss": 5.6621,
|
|
"mean_token_accuracy": 0.15215058922767638,
|
|
"num_tokens": 18488800.0,
|
|
"step": 10015
|
|
},
|
|
{
|
|
"entropy": 5.745443725585938,
|
|
"epoch": 0.8418399495904222,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000493546863571969,
|
|
"loss": 5.6216,
|
|
"mean_token_accuracy": 0.15481941550970077,
|
|
"num_tokens": 18498083.0,
|
|
"step": 10020
|
|
},
|
|
{
|
|
"entropy": 5.7482555389404295,
|
|
"epoch": 0.8422600294055871,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004935397409894333,
|
|
"loss": 5.5776,
|
|
"mean_token_accuracy": 0.14453350156545638,
|
|
"num_tokens": 18508265.0,
|
|
"step": 10025
|
|
},
|
|
{
|
|
"entropy": 5.769057273864746,
|
|
"epoch": 0.842680109220752,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004935326145355787,
|
|
"loss": 5.6014,
|
|
"mean_token_accuracy": 0.15536364465951918,
|
|
"num_tokens": 18517283.0,
|
|
"step": 10030
|
|
},
|
|
{
|
|
"entropy": 5.758950281143188,
|
|
"epoch": 0.8431001890359168,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004935254842105311,
|
|
"loss": 5.6254,
|
|
"mean_token_accuracy": 0.15739115327596664,
|
|
"num_tokens": 18526482.0,
|
|
"step": 10035
|
|
},
|
|
{
|
|
"entropy": 5.663691616058349,
|
|
"epoch": 0.8435202688510817,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004935183500144173,
|
|
"loss": 5.4506,
|
|
"mean_token_accuracy": 0.1704530283808708,
|
|
"num_tokens": 18536150.0,
|
|
"step": 10040
|
|
},
|
|
{
|
|
"entropy": 5.828098583221435,
|
|
"epoch": 0.8439403486662466,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004935112119473634,
|
|
"loss": 5.6655,
|
|
"mean_token_accuracy": 0.14966737627983093,
|
|
"num_tokens": 18545168.0,
|
|
"step": 10045
|
|
},
|
|
{
|
|
"entropy": 5.721202039718628,
|
|
"epoch": 0.8443604284814115,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004935040700094959,
|
|
"loss": 5.5768,
|
|
"mean_token_accuracy": 0.15967776775360107,
|
|
"num_tokens": 18553363.0,
|
|
"step": 10050
|
|
},
|
|
{
|
|
"entropy": 5.646053218841553,
|
|
"epoch": 0.8447805082965764,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004934969242009412,
|
|
"loss": 5.5193,
|
|
"mean_token_accuracy": 0.16305817514657975,
|
|
"num_tokens": 18562546.0,
|
|
"step": 10055
|
|
},
|
|
{
|
|
"entropy": 5.729463911056518,
|
|
"epoch": 0.8452005881117413,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004934897745218262,
|
|
"loss": 5.6211,
|
|
"mean_token_accuracy": 0.15121982246637344,
|
|
"num_tokens": 18572149.0,
|
|
"step": 10060
|
|
},
|
|
{
|
|
"entropy": 5.745990800857544,
|
|
"epoch": 0.8456206679269062,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004934826209722772,
|
|
"loss": 5.4757,
|
|
"mean_token_accuracy": 0.16152824461460114,
|
|
"num_tokens": 18580842.0,
|
|
"step": 10065
|
|
},
|
|
{
|
|
"entropy": 5.741084957122803,
|
|
"epoch": 0.8460407477420709,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0004934754635524211,
|
|
"loss": 5.5795,
|
|
"mean_token_accuracy": 0.16007215827703475,
|
|
"num_tokens": 18589765.0,
|
|
"step": 10070
|
|
},
|
|
{
|
|
"entropy": 5.711794137954712,
|
|
"epoch": 0.8464608275572358,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004934683022623847,
|
|
"loss": 5.6083,
|
|
"mean_token_accuracy": 0.15259937196969986,
|
|
"num_tokens": 18599532.0,
|
|
"step": 10075
|
|
},
|
|
{
|
|
"entropy": 5.65915470123291,
|
|
"epoch": 0.8468809073724007,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004934611371022947,
|
|
"loss": 5.5002,
|
|
"mean_token_accuracy": 0.1630965918302536,
|
|
"num_tokens": 18608438.0,
|
|
"step": 10080
|
|
},
|
|
{
|
|
"entropy": 5.7712366580963135,
|
|
"epoch": 0.8473009871875656,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004934539680722783,
|
|
"loss": 5.6625,
|
|
"mean_token_accuracy": 0.15168794989585876,
|
|
"num_tokens": 18617313.0,
|
|
"step": 10085
|
|
},
|
|
{
|
|
"entropy": 5.7000326156616214,
|
|
"epoch": 0.8477210670027305,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004934467951724622,
|
|
"loss": 5.4855,
|
|
"mean_token_accuracy": 0.1603113070130348,
|
|
"num_tokens": 18625880.0,
|
|
"step": 10090
|
|
},
|
|
{
|
|
"entropy": 5.690435123443604,
|
|
"epoch": 0.8481411468178954,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004934396184029737,
|
|
"loss": 5.5692,
|
|
"mean_token_accuracy": 0.15285091400146483,
|
|
"num_tokens": 18635727.0,
|
|
"step": 10095
|
|
},
|
|
{
|
|
"entropy": 5.747970628738403,
|
|
"epoch": 0.8485612266330603,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004934324377639398,
|
|
"loss": 5.621,
|
|
"mean_token_accuracy": 0.15370510891079903,
|
|
"num_tokens": 18645619.0,
|
|
"step": 10100
|
|
},
|
|
{
|
|
"entropy": 5.742952156066894,
|
|
"epoch": 0.8489813064482251,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004934252532554878,
|
|
"loss": 5.5197,
|
|
"mean_token_accuracy": 0.1536563068628311,
|
|
"num_tokens": 18654901.0,
|
|
"step": 10105
|
|
},
|
|
{
|
|
"entropy": 5.799785947799682,
|
|
"epoch": 0.84940138626339,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004934180648777449,
|
|
"loss": 5.7802,
|
|
"mean_token_accuracy": 0.15087857395410537,
|
|
"num_tokens": 18664523.0,
|
|
"step": 10110
|
|
},
|
|
{
|
|
"entropy": 5.763763666152954,
|
|
"epoch": 0.8498214660785549,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004934108726308384,
|
|
"loss": 5.6,
|
|
"mean_token_accuracy": 0.15230715721845628,
|
|
"num_tokens": 18673685.0,
|
|
"step": 10115
|
|
},
|
|
{
|
|
"entropy": 5.713348913192749,
|
|
"epoch": 0.8502415458937198,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004934036765148958,
|
|
"loss": 5.5767,
|
|
"mean_token_accuracy": 0.15448796078562738,
|
|
"num_tokens": 18682889.0,
|
|
"step": 10120
|
|
},
|
|
{
|
|
"entropy": 5.75453748703003,
|
|
"epoch": 0.8506616257088847,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004933964765300446,
|
|
"loss": 5.6178,
|
|
"mean_token_accuracy": 0.1548921898007393,
|
|
"num_tokens": 18692978.0,
|
|
"step": 10125
|
|
},
|
|
{
|
|
"entropy": 5.711153602600097,
|
|
"epoch": 0.8510817055240496,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.000493389272676412,
|
|
"loss": 5.5298,
|
|
"mean_token_accuracy": 0.15920393615961076,
|
|
"num_tokens": 18701846.0,
|
|
"step": 10130
|
|
},
|
|
{
|
|
"entropy": 5.671429061889649,
|
|
"epoch": 0.8515017853392145,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004933820649541262,
|
|
"loss": 5.5761,
|
|
"mean_token_accuracy": 0.16145771741867065,
|
|
"num_tokens": 18711492.0,
|
|
"step": 10135
|
|
},
|
|
{
|
|
"entropy": 5.657542657852173,
|
|
"epoch": 0.8519218651543793,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004933748533633145,
|
|
"loss": 5.494,
|
|
"mean_token_accuracy": 0.16596206575632094,
|
|
"num_tokens": 18720407.0,
|
|
"step": 10140
|
|
},
|
|
{
|
|
"entropy": 5.707644414901734,
|
|
"epoch": 0.8523419449695442,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004933676379041045,
|
|
"loss": 5.5525,
|
|
"mean_token_accuracy": 0.16160011291503906,
|
|
"num_tokens": 18729968.0,
|
|
"step": 10145
|
|
},
|
|
{
|
|
"entropy": 5.740623044967651,
|
|
"epoch": 0.8527620247847091,
|
|
"grad_norm": 1.3046875,
|
|
"learning_rate": 0.0004933604185766245,
|
|
"loss": 5.6485,
|
|
"mean_token_accuracy": 0.1480335585772991,
|
|
"num_tokens": 18739525.0,
|
|
"step": 10150
|
|
},
|
|
{
|
|
"entropy": 5.733673620223999,
|
|
"epoch": 0.853182104599874,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004933531953810019,
|
|
"loss": 5.5518,
|
|
"mean_token_accuracy": 0.16111933290958405,
|
|
"num_tokens": 18749087.0,
|
|
"step": 10155
|
|
},
|
|
{
|
|
"entropy": 5.77870192527771,
|
|
"epoch": 0.8536021844150389,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004933459683173652,
|
|
"loss": 5.5768,
|
|
"mean_token_accuracy": 0.1587308406829834,
|
|
"num_tokens": 18758174.0,
|
|
"step": 10160
|
|
},
|
|
{
|
|
"entropy": 5.752344226837158,
|
|
"epoch": 0.8540222642302038,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004933387373858418,
|
|
"loss": 5.6233,
|
|
"mean_token_accuracy": 0.15292344242334366,
|
|
"num_tokens": 18767679.0,
|
|
"step": 10165
|
|
},
|
|
{
|
|
"entropy": 5.755201578140259,
|
|
"epoch": 0.8544423440453687,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004933315025865602,
|
|
"loss": 5.5476,
|
|
"mean_token_accuracy": 0.15560670644044877,
|
|
"num_tokens": 18776749.0,
|
|
"step": 10170
|
|
},
|
|
{
|
|
"entropy": 5.797035074234008,
|
|
"epoch": 0.8548624238605335,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004933242639196485,
|
|
"loss": 5.7157,
|
|
"mean_token_accuracy": 0.14426579549908639,
|
|
"num_tokens": 18786313.0,
|
|
"step": 10175
|
|
},
|
|
{
|
|
"entropy": 5.774004793167114,
|
|
"epoch": 0.8552825036756984,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004933170213852348,
|
|
"loss": 5.6112,
|
|
"mean_token_accuracy": 0.1511244609951973,
|
|
"num_tokens": 18795340.0,
|
|
"step": 10180
|
|
},
|
|
{
|
|
"entropy": 5.71902289390564,
|
|
"epoch": 0.8557025834908633,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004933097749834476,
|
|
"loss": 5.5333,
|
|
"mean_token_accuracy": 0.1552624970674515,
|
|
"num_tokens": 18804114.0,
|
|
"step": 10185
|
|
},
|
|
{
|
|
"entropy": 5.686375379562378,
|
|
"epoch": 0.8561226633060282,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000493302524714415,
|
|
"loss": 5.5489,
|
|
"mean_token_accuracy": 0.15422181487083436,
|
|
"num_tokens": 18813797.0,
|
|
"step": 10190
|
|
},
|
|
{
|
|
"entropy": 5.722941493988037,
|
|
"epoch": 0.856542743121193,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004932952705782657,
|
|
"loss": 5.5966,
|
|
"mean_token_accuracy": 0.15367913916707038,
|
|
"num_tokens": 18822410.0,
|
|
"step": 10195
|
|
},
|
|
{
|
|
"entropy": 5.683314323425293,
|
|
"epoch": 0.856962822936358,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000493288012575128,
|
|
"loss": 5.5436,
|
|
"mean_token_accuracy": 0.1596749320626259,
|
|
"num_tokens": 18832091.0,
|
|
"step": 10200
|
|
},
|
|
{
|
|
"entropy": 5.7361071586608885,
|
|
"epoch": 0.8573829027515227,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004932807507051307,
|
|
"loss": 5.5526,
|
|
"mean_token_accuracy": 0.15038529485464097,
|
|
"num_tokens": 18841298.0,
|
|
"step": 10205
|
|
},
|
|
{
|
|
"entropy": 5.656687879562378,
|
|
"epoch": 0.8578029825666876,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004932734849684022,
|
|
"loss": 5.5347,
|
|
"mean_token_accuracy": 0.1560280829668045,
|
|
"num_tokens": 18849683.0,
|
|
"step": 10210
|
|
},
|
|
{
|
|
"entropy": 5.711640548706055,
|
|
"epoch": 0.8582230623818525,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004932662153650712,
|
|
"loss": 5.4793,
|
|
"mean_token_accuracy": 0.15597545802593232,
|
|
"num_tokens": 18858832.0,
|
|
"step": 10215
|
|
},
|
|
{
|
|
"entropy": 5.655262565612793,
|
|
"epoch": 0.8586431421970174,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004932589418952668,
|
|
"loss": 5.5174,
|
|
"mean_token_accuracy": 0.1572958916425705,
|
|
"num_tokens": 18867652.0,
|
|
"step": 10220
|
|
},
|
|
{
|
|
"entropy": 5.753128433227539,
|
|
"epoch": 0.8590632220121823,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004932516645591175,
|
|
"loss": 5.5962,
|
|
"mean_token_accuracy": 0.15835005044937134,
|
|
"num_tokens": 18877282.0,
|
|
"step": 10225
|
|
},
|
|
{
|
|
"entropy": 5.794670820236206,
|
|
"epoch": 0.8594833018273472,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004932443833567524,
|
|
"loss": 5.731,
|
|
"mean_token_accuracy": 0.15675377175211908,
|
|
"num_tokens": 18886565.0,
|
|
"step": 10230
|
|
},
|
|
{
|
|
"entropy": 5.803075695037842,
|
|
"epoch": 0.8599033816425121,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004932370982883003,
|
|
"loss": 5.6298,
|
|
"mean_token_accuracy": 0.15467827767133713,
|
|
"num_tokens": 18896440.0,
|
|
"step": 10235
|
|
},
|
|
{
|
|
"entropy": 5.814779329299927,
|
|
"epoch": 0.8603234614576769,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004932298093538905,
|
|
"loss": 5.6525,
|
|
"mean_token_accuracy": 0.15493873208761216,
|
|
"num_tokens": 18906246.0,
|
|
"step": 10240
|
|
},
|
|
{
|
|
"entropy": 5.695867204666138,
|
|
"epoch": 0.8607435412728418,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.000493222516553652,
|
|
"loss": 5.5642,
|
|
"mean_token_accuracy": 0.1503233715891838,
|
|
"num_tokens": 18915108.0,
|
|
"step": 10245
|
|
},
|
|
{
|
|
"entropy": 5.709557580947876,
|
|
"epoch": 0.8611636210880067,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004932152198877139,
|
|
"loss": 5.5607,
|
|
"mean_token_accuracy": 0.16035114824771882,
|
|
"num_tokens": 18923664.0,
|
|
"step": 10250
|
|
},
|
|
{
|
|
"entropy": 5.74764666557312,
|
|
"epoch": 0.8615837009031716,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004932079193562057,
|
|
"loss": 5.6686,
|
|
"mean_token_accuracy": 0.1497301295399666,
|
|
"num_tokens": 18933496.0,
|
|
"step": 10255
|
|
},
|
|
{
|
|
"entropy": 5.679797077178955,
|
|
"epoch": 0.8620037807183365,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004932006149592564,
|
|
"loss": 5.5357,
|
|
"mean_token_accuracy": 0.1623591274023056,
|
|
"num_tokens": 18942222.0,
|
|
"step": 10260
|
|
},
|
|
{
|
|
"entropy": 5.8157978534698485,
|
|
"epoch": 0.8624238605335014,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004931933066969957,
|
|
"loss": 5.5669,
|
|
"mean_token_accuracy": 0.15557005107402802,
|
|
"num_tokens": 18952057.0,
|
|
"step": 10265
|
|
},
|
|
{
|
|
"entropy": 5.7592529296875,
|
|
"epoch": 0.8628439403486663,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004931859945695528,
|
|
"loss": 5.6168,
|
|
"mean_token_accuracy": 0.1521659165620804,
|
|
"num_tokens": 18961664.0,
|
|
"step": 10270
|
|
},
|
|
{
|
|
"entropy": 5.632269430160522,
|
|
"epoch": 0.8632640201638311,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004931786785770575,
|
|
"loss": 5.3931,
|
|
"mean_token_accuracy": 0.1726113513112068,
|
|
"num_tokens": 18969900.0,
|
|
"step": 10275
|
|
},
|
|
{
|
|
"entropy": 5.746176385879517,
|
|
"epoch": 0.863684099978996,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.0004931713587196392,
|
|
"loss": 5.6827,
|
|
"mean_token_accuracy": 0.15060218945145606,
|
|
"num_tokens": 18979286.0,
|
|
"step": 10280
|
|
},
|
|
{
|
|
"entropy": 5.798988389968872,
|
|
"epoch": 0.8641041797941609,
|
|
"grad_norm": 1.265625,
|
|
"learning_rate": 0.0004931640349974275,
|
|
"loss": 5.5713,
|
|
"mean_token_accuracy": 0.15988062992691993,
|
|
"num_tokens": 18987553.0,
|
|
"step": 10285
|
|
},
|
|
{
|
|
"entropy": 5.792628335952759,
|
|
"epoch": 0.8645242596093258,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004931567074105524,
|
|
"loss": 5.658,
|
|
"mean_token_accuracy": 0.15038609951734544,
|
|
"num_tokens": 18996354.0,
|
|
"step": 10290
|
|
},
|
|
{
|
|
"entropy": 5.680066680908203,
|
|
"epoch": 0.8649443394244907,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004931493759591435,
|
|
"loss": 5.5531,
|
|
"mean_token_accuracy": 0.15763965249061584,
|
|
"num_tokens": 19005150.0,
|
|
"step": 10295
|
|
},
|
|
{
|
|
"entropy": 5.721858263015747,
|
|
"epoch": 0.8653644192396556,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004931420406433308,
|
|
"loss": 5.5557,
|
|
"mean_token_accuracy": 0.1564570590853691,
|
|
"num_tokens": 19014572.0,
|
|
"step": 10300
|
|
},
|
|
{
|
|
"entropy": 5.70249376296997,
|
|
"epoch": 0.8657844990548205,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.000493134701463244,
|
|
"loss": 5.4212,
|
|
"mean_token_accuracy": 0.1627628058195114,
|
|
"num_tokens": 19023462.0,
|
|
"step": 10305
|
|
},
|
|
{
|
|
"entropy": 5.612477827072143,
|
|
"epoch": 0.8662045788699853,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004931273584190135,
|
|
"loss": 5.5076,
|
|
"mean_token_accuracy": 0.15907565802335738,
|
|
"num_tokens": 19032460.0,
|
|
"step": 10310
|
|
},
|
|
{
|
|
"entropy": 5.660311269760132,
|
|
"epoch": 0.8666246586851502,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004931200115107691,
|
|
"loss": 5.5494,
|
|
"mean_token_accuracy": 0.16178966760635377,
|
|
"num_tokens": 19041734.0,
|
|
"step": 10315
|
|
},
|
|
{
|
|
"entropy": 5.7060261249542235,
|
|
"epoch": 0.867044738500315,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.000493112660738641,
|
|
"loss": 5.5361,
|
|
"mean_token_accuracy": 0.15414502024650573,
|
|
"num_tokens": 19050867.0,
|
|
"step": 10320
|
|
},
|
|
{
|
|
"entropy": 5.722083282470703,
|
|
"epoch": 0.86746481831548,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004931053061027594,
|
|
"loss": 5.5354,
|
|
"mean_token_accuracy": 0.156645068526268,
|
|
"num_tokens": 19060518.0,
|
|
"step": 10325
|
|
},
|
|
{
|
|
"entropy": 5.697010898590088,
|
|
"epoch": 0.8678848981306448,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.0004930979476032546,
|
|
"loss": 5.5213,
|
|
"mean_token_accuracy": 0.16217473298311233,
|
|
"num_tokens": 19069588.0,
|
|
"step": 10330
|
|
},
|
|
{
|
|
"entropy": 5.727666282653809,
|
|
"epoch": 0.8683049779458097,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000493090585240257,
|
|
"loss": 5.5772,
|
|
"mean_token_accuracy": 0.14466141611337663,
|
|
"num_tokens": 19079060.0,
|
|
"step": 10335
|
|
},
|
|
{
|
|
"entropy": 5.64642915725708,
|
|
"epoch": 0.8687250577609746,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004930832190138969,
|
|
"loss": 5.4841,
|
|
"mean_token_accuracy": 0.15521228462457656,
|
|
"num_tokens": 19087721.0,
|
|
"step": 10340
|
|
},
|
|
{
|
|
"entropy": 5.668660879135132,
|
|
"epoch": 0.8691451375761394,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.000493075848924305,
|
|
"loss": 5.5447,
|
|
"mean_token_accuracy": 0.15732968896627425,
|
|
"num_tokens": 19096800.0,
|
|
"step": 10345
|
|
},
|
|
{
|
|
"entropy": 5.728336191177368,
|
|
"epoch": 0.8695652173913043,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004930684749716117,
|
|
"loss": 5.5913,
|
|
"mean_token_accuracy": 0.15400783568620682,
|
|
"num_tokens": 19106774.0,
|
|
"step": 10350
|
|
},
|
|
{
|
|
"entropy": 5.756087446212769,
|
|
"epoch": 0.8699852972064692,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004930610971559476,
|
|
"loss": 5.5439,
|
|
"mean_token_accuracy": 0.1576864629983902,
|
|
"num_tokens": 19116413.0,
|
|
"step": 10355
|
|
},
|
|
{
|
|
"entropy": 5.752991104125977,
|
|
"epoch": 0.8704053770216341,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004930537154774436,
|
|
"loss": 5.5522,
|
|
"mean_token_accuracy": 0.15667675733566283,
|
|
"num_tokens": 19125363.0,
|
|
"step": 10360
|
|
},
|
|
{
|
|
"entropy": 5.729405260086059,
|
|
"epoch": 0.870825456836799,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004930463299362302,
|
|
"loss": 5.6603,
|
|
"mean_token_accuracy": 0.1461710177361965,
|
|
"num_tokens": 19135461.0,
|
|
"step": 10365
|
|
},
|
|
{
|
|
"entropy": 5.793264532089234,
|
|
"epoch": 0.8712455366519639,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004930389405324383,
|
|
"loss": 5.5182,
|
|
"mean_token_accuracy": 0.16262105852365494,
|
|
"num_tokens": 19144085.0,
|
|
"step": 10370
|
|
},
|
|
{
|
|
"entropy": 5.733283281326294,
|
|
"epoch": 0.8716656164671287,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004930315472661987,
|
|
"loss": 5.5368,
|
|
"mean_token_accuracy": 0.16369505524635314,
|
|
"num_tokens": 19153291.0,
|
|
"step": 10375
|
|
},
|
|
{
|
|
"entropy": 5.709996700286865,
|
|
"epoch": 0.8720856962822936,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004930241501376428,
|
|
"loss": 5.5524,
|
|
"mean_token_accuracy": 0.1532418116927147,
|
|
"num_tokens": 19163514.0,
|
|
"step": 10380
|
|
},
|
|
{
|
|
"entropy": 5.618998575210571,
|
|
"epoch": 0.8725057760974585,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004930167491469013,
|
|
"loss": 5.447,
|
|
"mean_token_accuracy": 0.16484934836626053,
|
|
"num_tokens": 19172103.0,
|
|
"step": 10385
|
|
},
|
|
{
|
|
"entropy": 5.679058408737182,
|
|
"epoch": 0.8729258559126234,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004930093442941053,
|
|
"loss": 5.5135,
|
|
"mean_token_accuracy": 0.15935876667499543,
|
|
"num_tokens": 19180893.0,
|
|
"step": 10390
|
|
},
|
|
{
|
|
"entropy": 5.665105485916138,
|
|
"epoch": 0.8733459357277883,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004930019355793858,
|
|
"loss": 5.4451,
|
|
"mean_token_accuracy": 0.16085383147001267,
|
|
"num_tokens": 19190495.0,
|
|
"step": 10395
|
|
},
|
|
{
|
|
"entropy": 5.698199129104614,
|
|
"epoch": 0.8737660155429532,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004929945230028746,
|
|
"loss": 5.544,
|
|
"mean_token_accuracy": 0.1626562222838402,
|
|
"num_tokens": 19198988.0,
|
|
"step": 10400
|
|
},
|
|
{
|
|
"entropy": 5.6585792064666744,
|
|
"epoch": 0.8741860953581181,
|
|
"grad_norm": 2.90625,
|
|
"learning_rate": 0.0004929871065647024,
|
|
"loss": 5.4557,
|
|
"mean_token_accuracy": 0.16211153268814088,
|
|
"num_tokens": 19208014.0,
|
|
"step": 10405
|
|
},
|
|
{
|
|
"entropy": 5.7339530944824215,
|
|
"epoch": 0.8746061751732829,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004929796862650011,
|
|
"loss": 5.6421,
|
|
"mean_token_accuracy": 0.1563207045197487,
|
|
"num_tokens": 19218220.0,
|
|
"step": 10410
|
|
},
|
|
{
|
|
"entropy": 5.693988180160522,
|
|
"epoch": 0.8750262549884478,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004929722621039018,
|
|
"loss": 5.5251,
|
|
"mean_token_accuracy": 0.1552248328924179,
|
|
"num_tokens": 19227176.0,
|
|
"step": 10415
|
|
},
|
|
{
|
|
"entropy": 5.644034004211425,
|
|
"epoch": 0.8754463348036127,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004929648340815362,
|
|
"loss": 5.5651,
|
|
"mean_token_accuracy": 0.15514406561851501,
|
|
"num_tokens": 19236085.0,
|
|
"step": 10420
|
|
},
|
|
{
|
|
"entropy": 5.694051218032837,
|
|
"epoch": 0.8758664146187776,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004929574021980355,
|
|
"loss": 5.6118,
|
|
"mean_token_accuracy": 0.15097474902868271,
|
|
"num_tokens": 19246671.0,
|
|
"step": 10425
|
|
},
|
|
{
|
|
"entropy": 5.706706523895264,
|
|
"epoch": 0.8762864944339425,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004929499664535319,
|
|
"loss": 5.5254,
|
|
"mean_token_accuracy": 0.1542436182498932,
|
|
"num_tokens": 19256321.0,
|
|
"step": 10430
|
|
},
|
|
{
|
|
"entropy": 5.661406517028809,
|
|
"epoch": 0.8767065742491074,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004929425268481569,
|
|
"loss": 5.4778,
|
|
"mean_token_accuracy": 0.16220592856407165,
|
|
"num_tokens": 19265518.0,
|
|
"step": 10435
|
|
},
|
|
{
|
|
"entropy": 5.673083209991455,
|
|
"epoch": 0.8771266540642723,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004929350833820422,
|
|
"loss": 5.4903,
|
|
"mean_token_accuracy": 0.16280142813920975,
|
|
"num_tokens": 19274120.0,
|
|
"step": 10440
|
|
},
|
|
{
|
|
"entropy": 5.7643969535827635,
|
|
"epoch": 0.877546733879437,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004929276360553197,
|
|
"loss": 5.5457,
|
|
"mean_token_accuracy": 0.16031394377350808,
|
|
"num_tokens": 19284377.0,
|
|
"step": 10445
|
|
},
|
|
{
|
|
"entropy": 5.61663851737976,
|
|
"epoch": 0.8779668136946019,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.0004929201848681213,
|
|
"loss": 5.4383,
|
|
"mean_token_accuracy": 0.15702698826789857,
|
|
"num_tokens": 19293326.0,
|
|
"step": 10450
|
|
},
|
|
{
|
|
"entropy": 5.622338199615479,
|
|
"epoch": 0.8783868935097668,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004929127298205792,
|
|
"loss": 5.4719,
|
|
"mean_token_accuracy": 0.16489966809749604,
|
|
"num_tokens": 19302086.0,
|
|
"step": 10455
|
|
},
|
|
{
|
|
"entropy": 5.729528093338013,
|
|
"epoch": 0.8788069733249317,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004929052709128251,
|
|
"loss": 5.5196,
|
|
"mean_token_accuracy": 0.16125201433897018,
|
|
"num_tokens": 19310124.0,
|
|
"step": 10460
|
|
},
|
|
{
|
|
"entropy": 5.654933595657349,
|
|
"epoch": 0.8792270531400966,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004928978081449914,
|
|
"loss": 5.5387,
|
|
"mean_token_accuracy": 0.15293454825878144,
|
|
"num_tokens": 19321269.0,
|
|
"step": 10465
|
|
},
|
|
{
|
|
"entropy": 5.675232601165772,
|
|
"epoch": 0.8796471329552615,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004928903415172103,
|
|
"loss": 5.5512,
|
|
"mean_token_accuracy": 0.15898366868495942,
|
|
"num_tokens": 19330390.0,
|
|
"step": 10470
|
|
},
|
|
{
|
|
"entropy": 5.7150938510894775,
|
|
"epoch": 0.8800672127704264,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.000492882871029614,
|
|
"loss": 5.5357,
|
|
"mean_token_accuracy": 0.16283403784036637,
|
|
"num_tokens": 19339457.0,
|
|
"step": 10475
|
|
},
|
|
{
|
|
"entropy": 5.688582849502564,
|
|
"epoch": 0.8804872925855912,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004928753966823348,
|
|
"loss": 5.6011,
|
|
"mean_token_accuracy": 0.15648485273122786,
|
|
"num_tokens": 19348710.0,
|
|
"step": 10480
|
|
},
|
|
{
|
|
"entropy": 5.686913347244262,
|
|
"epoch": 0.8809073724007561,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004928679184755051,
|
|
"loss": 5.6302,
|
|
"mean_token_accuracy": 0.15994774252176286,
|
|
"num_tokens": 19357215.0,
|
|
"step": 10485
|
|
},
|
|
{
|
|
"entropy": 5.690790605545044,
|
|
"epoch": 0.881327452215921,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004928604364092574,
|
|
"loss": 5.5702,
|
|
"mean_token_accuracy": 0.16070739924907684,
|
|
"num_tokens": 19366043.0,
|
|
"step": 10490
|
|
},
|
|
{
|
|
"entropy": 5.751299858093262,
|
|
"epoch": 0.8817475320310859,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004928529504837243,
|
|
"loss": 5.6335,
|
|
"mean_token_accuracy": 0.15246885567903518,
|
|
"num_tokens": 19375468.0,
|
|
"step": 10495
|
|
},
|
|
{
|
|
"entropy": 5.7539137840271,
|
|
"epoch": 0.8821676118462508,
|
|
"grad_norm": 1.328125,
|
|
"learning_rate": 0.0004928454606990383,
|
|
"loss": 5.5061,
|
|
"mean_token_accuracy": 0.16103250831365584,
|
|
"num_tokens": 19384467.0,
|
|
"step": 10500
|
|
},
|
|
{
|
|
"entropy": 5.682570075988769,
|
|
"epoch": 0.8825876916614157,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004928379670553322,
|
|
"loss": 5.5632,
|
|
"mean_token_accuracy": 0.1597358301281929,
|
|
"num_tokens": 19393618.0,
|
|
"step": 10505
|
|
},
|
|
{
|
|
"entropy": 5.729544639587402,
|
|
"epoch": 0.8830077714765806,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004928304695527387,
|
|
"loss": 5.609,
|
|
"mean_token_accuracy": 0.15516545474529267,
|
|
"num_tokens": 19402921.0,
|
|
"step": 10510
|
|
},
|
|
{
|
|
"entropy": 5.759986829757691,
|
|
"epoch": 0.8834278512917454,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004928229681913905,
|
|
"loss": 5.5892,
|
|
"mean_token_accuracy": 0.15613035261631011,
|
|
"num_tokens": 19412048.0,
|
|
"step": 10515
|
|
},
|
|
{
|
|
"entropy": 5.765500164031982,
|
|
"epoch": 0.8838479311069103,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004928154629714207,
|
|
"loss": 5.5768,
|
|
"mean_token_accuracy": 0.15744251161813735,
|
|
"num_tokens": 19420993.0,
|
|
"step": 10520
|
|
},
|
|
{
|
|
"entropy": 5.685095119476318,
|
|
"epoch": 0.8842680109220752,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000492807953892962,
|
|
"loss": 5.5523,
|
|
"mean_token_accuracy": 0.15811701267957687,
|
|
"num_tokens": 19430145.0,
|
|
"step": 10525
|
|
},
|
|
{
|
|
"entropy": 5.702225112915039,
|
|
"epoch": 0.8846880907372401,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004928004409561476,
|
|
"loss": 5.4636,
|
|
"mean_token_accuracy": 0.1651538133621216,
|
|
"num_tokens": 19438918.0,
|
|
"step": 10530
|
|
},
|
|
{
|
|
"entropy": 5.668575286865234,
|
|
"epoch": 0.885108170552405,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004927929241611106,
|
|
"loss": 5.518,
|
|
"mean_token_accuracy": 0.16605824753642082,
|
|
"num_tokens": 19448490.0,
|
|
"step": 10535
|
|
},
|
|
{
|
|
"entropy": 5.707067775726318,
|
|
"epoch": 0.8855282503675699,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.000492785403507984,
|
|
"loss": 5.5652,
|
|
"mean_token_accuracy": 0.15584489703178406,
|
|
"num_tokens": 19457098.0,
|
|
"step": 10540
|
|
},
|
|
{
|
|
"entropy": 5.697981834411621,
|
|
"epoch": 0.8859483301827347,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004927778789969012,
|
|
"loss": 5.5594,
|
|
"mean_token_accuracy": 0.1538078561425209,
|
|
"num_tokens": 19466419.0,
|
|
"step": 10545
|
|
},
|
|
{
|
|
"entropy": 5.703227710723877,
|
|
"epoch": 0.8863684099978996,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004927703506279955,
|
|
"loss": 5.6216,
|
|
"mean_token_accuracy": 0.15174467712640763,
|
|
"num_tokens": 19475882.0,
|
|
"step": 10550
|
|
},
|
|
{
|
|
"entropy": 5.798790454864502,
|
|
"epoch": 0.8867884898130645,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004927628184014,
|
|
"loss": 5.6533,
|
|
"mean_token_accuracy": 0.1536669135093689,
|
|
"num_tokens": 19485917.0,
|
|
"step": 10555
|
|
},
|
|
{
|
|
"entropy": 5.729129076004028,
|
|
"epoch": 0.8872085696282294,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004927552823172483,
|
|
"loss": 5.5768,
|
|
"mean_token_accuracy": 0.15445511490106584,
|
|
"num_tokens": 19494984.0,
|
|
"step": 10560
|
|
},
|
|
{
|
|
"entropy": 5.739677906036377,
|
|
"epoch": 0.8876286494433943,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000492747742375674,
|
|
"loss": 5.5166,
|
|
"mean_token_accuracy": 0.16236048340797424,
|
|
"num_tokens": 19504087.0,
|
|
"step": 10565
|
|
},
|
|
{
|
|
"entropy": 5.720370864868164,
|
|
"epoch": 0.8880487292585592,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004927401985768106,
|
|
"loss": 5.5661,
|
|
"mean_token_accuracy": 0.16260765492916107,
|
|
"num_tokens": 19512880.0,
|
|
"step": 10570
|
|
},
|
|
{
|
|
"entropy": 5.652678537368774,
|
|
"epoch": 0.888468809073724,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004927326509207915,
|
|
"loss": 5.5515,
|
|
"mean_token_accuracy": 0.1601003259420395,
|
|
"num_tokens": 19521723.0,
|
|
"step": 10575
|
|
},
|
|
{
|
|
"entropy": 5.738314771652222,
|
|
"epoch": 0.8888888888888888,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004927250994077508,
|
|
"loss": 5.6127,
|
|
"mean_token_accuracy": 0.15562157332897186,
|
|
"num_tokens": 19531352.0,
|
|
"step": 10580
|
|
},
|
|
{
|
|
"entropy": 5.794875717163086,
|
|
"epoch": 0.8893089687040537,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.000492717544037822,
|
|
"loss": 5.7321,
|
|
"mean_token_accuracy": 0.15739693194627763,
|
|
"num_tokens": 19540943.0,
|
|
"step": 10585
|
|
},
|
|
{
|
|
"entropy": 5.683047342300415,
|
|
"epoch": 0.8897290485192186,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.000492709984811139,
|
|
"loss": 5.4835,
|
|
"mean_token_accuracy": 0.16288544684648515,
|
|
"num_tokens": 19550527.0,
|
|
"step": 10590
|
|
},
|
|
{
|
|
"entropy": 5.669373846054077,
|
|
"epoch": 0.8901491283343835,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004927024217278358,
|
|
"loss": 5.4758,
|
|
"mean_token_accuracy": 0.16461390554904937,
|
|
"num_tokens": 19559746.0,
|
|
"step": 10595
|
|
},
|
|
{
|
|
"entropy": 5.729438066482544,
|
|
"epoch": 0.8905692081495484,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004926948547880462,
|
|
"loss": 5.636,
|
|
"mean_token_accuracy": 0.1517734244465828,
|
|
"num_tokens": 19569286.0,
|
|
"step": 10600
|
|
},
|
|
{
|
|
"entropy": 5.695087623596192,
|
|
"epoch": 0.8909892879647133,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004926872839919044,
|
|
"loss": 5.5164,
|
|
"mean_token_accuracy": 0.15850782245397568,
|
|
"num_tokens": 19578245.0,
|
|
"step": 10605
|
|
},
|
|
{
|
|
"entropy": 5.637440395355225,
|
|
"epoch": 0.8914093677798782,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004926797093395446,
|
|
"loss": 5.4863,
|
|
"mean_token_accuracy": 0.16317880302667617,
|
|
"num_tokens": 19587244.0,
|
|
"step": 10610
|
|
},
|
|
{
|
|
"entropy": 5.670547389984131,
|
|
"epoch": 0.891829447595043,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004926721308311006,
|
|
"loss": 5.5628,
|
|
"mean_token_accuracy": 0.16251896917819977,
|
|
"num_tokens": 19596932.0,
|
|
"step": 10615
|
|
},
|
|
{
|
|
"entropy": 5.772653007507325,
|
|
"epoch": 0.8922495274102079,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004926645484667069,
|
|
"loss": 5.6799,
|
|
"mean_token_accuracy": 0.15312184244394303,
|
|
"num_tokens": 19606256.0,
|
|
"step": 10620
|
|
},
|
|
{
|
|
"entropy": 5.808226013183594,
|
|
"epoch": 0.8926696072253728,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004926569622464979,
|
|
"loss": 5.6835,
|
|
"mean_token_accuracy": 0.1561840444803238,
|
|
"num_tokens": 19615726.0,
|
|
"step": 10625
|
|
},
|
|
{
|
|
"entropy": 5.75749716758728,
|
|
"epoch": 0.8930896870405377,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004926493721706079,
|
|
"loss": 5.5446,
|
|
"mean_token_accuracy": 0.15656498074531555,
|
|
"num_tokens": 19624037.0,
|
|
"step": 10630
|
|
},
|
|
{
|
|
"entropy": 5.733023357391358,
|
|
"epoch": 0.8935097668557026,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004926417782391713,
|
|
"loss": 5.5234,
|
|
"mean_token_accuracy": 0.16201919168233872,
|
|
"num_tokens": 19632882.0,
|
|
"step": 10635
|
|
},
|
|
{
|
|
"entropy": 5.693700790405273,
|
|
"epoch": 0.8939298466708675,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004926341804523227,
|
|
"loss": 5.6482,
|
|
"mean_token_accuracy": 0.15508821457624436,
|
|
"num_tokens": 19642686.0,
|
|
"step": 10640
|
|
},
|
|
{
|
|
"entropy": 5.671689987182617,
|
|
"epoch": 0.8943499264860324,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004926265788101966,
|
|
"loss": 5.5437,
|
|
"mean_token_accuracy": 0.1606654331088066,
|
|
"num_tokens": 19651380.0,
|
|
"step": 10645
|
|
},
|
|
{
|
|
"entropy": 5.678468942642212,
|
|
"epoch": 0.8947700063011972,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004926189733129278,
|
|
"loss": 5.4821,
|
|
"mean_token_accuracy": 0.159104885160923,
|
|
"num_tokens": 19660136.0,
|
|
"step": 10650
|
|
},
|
|
{
|
|
"entropy": 5.646151065826416,
|
|
"epoch": 0.8951900861163621,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004926113639606509,
|
|
"loss": 5.525,
|
|
"mean_token_accuracy": 0.16899669989943505,
|
|
"num_tokens": 19669146.0,
|
|
"step": 10655
|
|
},
|
|
{
|
|
"entropy": 5.76297607421875,
|
|
"epoch": 0.895610165931527,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0004926037507535008,
|
|
"loss": 5.6574,
|
|
"mean_token_accuracy": 0.15518373548984526,
|
|
"num_tokens": 19678627.0,
|
|
"step": 10660
|
|
},
|
|
{
|
|
"entropy": 5.736645460128784,
|
|
"epoch": 0.8960302457466919,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004925961336916122,
|
|
"loss": 5.5778,
|
|
"mean_token_accuracy": 0.15911105573177337,
|
|
"num_tokens": 19688033.0,
|
|
"step": 10665
|
|
},
|
|
{
|
|
"entropy": 5.739694452285766,
|
|
"epoch": 0.8964503255618568,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004925885127751202,
|
|
"loss": 5.5968,
|
|
"mean_token_accuracy": 0.16282687336206436,
|
|
"num_tokens": 19696523.0,
|
|
"step": 10670
|
|
},
|
|
{
|
|
"entropy": 5.778036117553711,
|
|
"epoch": 0.8968704053770217,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004925808880041596,
|
|
"loss": 5.5251,
|
|
"mean_token_accuracy": 0.15527614951133728,
|
|
"num_tokens": 19706339.0,
|
|
"step": 10675
|
|
},
|
|
{
|
|
"entropy": 5.7975170612335205,
|
|
"epoch": 0.8972904851921865,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004925732593788658,
|
|
"loss": 5.5636,
|
|
"mean_token_accuracy": 0.15571669340133668,
|
|
"num_tokens": 19714779.0,
|
|
"step": 10680
|
|
},
|
|
{
|
|
"entropy": 5.723025703430176,
|
|
"epoch": 0.8977105650073514,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004925656268993737,
|
|
"loss": 5.6257,
|
|
"mean_token_accuracy": 0.15665646344423295,
|
|
"num_tokens": 19723727.0,
|
|
"step": 10685
|
|
},
|
|
{
|
|
"entropy": 5.654568767547607,
|
|
"epoch": 0.8981306448225163,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004925579905658185,
|
|
"loss": 5.5947,
|
|
"mean_token_accuracy": 0.15805498957633973,
|
|
"num_tokens": 19732783.0,
|
|
"step": 10690
|
|
},
|
|
{
|
|
"entropy": 5.73252534866333,
|
|
"epoch": 0.8985507246376812,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004925503503783355,
|
|
"loss": 5.5656,
|
|
"mean_token_accuracy": 0.15477960258722306,
|
|
"num_tokens": 19741268.0,
|
|
"step": 10695
|
|
},
|
|
{
|
|
"entropy": 5.781156778335571,
|
|
"epoch": 0.898970804452846,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004925427063370601,
|
|
"loss": 5.489,
|
|
"mean_token_accuracy": 0.1559618294239044,
|
|
"num_tokens": 19751490.0,
|
|
"step": 10700
|
|
},
|
|
{
|
|
"entropy": 5.717162132263184,
|
|
"epoch": 0.899390884268011,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004925350584421278,
|
|
"loss": 5.5318,
|
|
"mean_token_accuracy": 0.1574894294142723,
|
|
"num_tokens": 19760487.0,
|
|
"step": 10705
|
|
},
|
|
{
|
|
"entropy": 5.703652477264404,
|
|
"epoch": 0.8998109640831758,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004925274066936738,
|
|
"loss": 5.4923,
|
|
"mean_token_accuracy": 0.16476034224033356,
|
|
"num_tokens": 19768984.0,
|
|
"step": 10710
|
|
},
|
|
{
|
|
"entropy": 5.730563688278198,
|
|
"epoch": 0.9002310438983406,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004925197510918339,
|
|
"loss": 5.5004,
|
|
"mean_token_accuracy": 0.16111984401941298,
|
|
"num_tokens": 19778335.0,
|
|
"step": 10715
|
|
},
|
|
{
|
|
"entropy": 5.710481452941894,
|
|
"epoch": 0.9006511237135055,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004925120916367435,
|
|
"loss": 5.6523,
|
|
"mean_token_accuracy": 0.1486131727695465,
|
|
"num_tokens": 19789082.0,
|
|
"step": 10720
|
|
},
|
|
{
|
|
"entropy": 5.637285423278809,
|
|
"epoch": 0.9010712035286704,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004925044283285384,
|
|
"loss": 5.353,
|
|
"mean_token_accuracy": 0.1743251711130142,
|
|
"num_tokens": 19797902.0,
|
|
"step": 10725
|
|
},
|
|
{
|
|
"entropy": 5.603871488571167,
|
|
"epoch": 0.9014912833438353,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004924967611673544,
|
|
"loss": 5.5226,
|
|
"mean_token_accuracy": 0.16380647271871568,
|
|
"num_tokens": 19806481.0,
|
|
"step": 10730
|
|
},
|
|
{
|
|
"entropy": 5.588102340698242,
|
|
"epoch": 0.9019113631590002,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004924890901533273,
|
|
"loss": 5.4262,
|
|
"mean_token_accuracy": 0.172759747505188,
|
|
"num_tokens": 19815226.0,
|
|
"step": 10735
|
|
},
|
|
{
|
|
"entropy": 5.783882665634155,
|
|
"epoch": 0.9023314429741651,
|
|
"grad_norm": 2.609375,
|
|
"learning_rate": 0.0004924814152865929,
|
|
"loss": 5.6552,
|
|
"mean_token_accuracy": 0.1513750970363617,
|
|
"num_tokens": 19824577.0,
|
|
"step": 10740
|
|
},
|
|
{
|
|
"entropy": 5.754131412506103,
|
|
"epoch": 0.90275152278933,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004924737365672873,
|
|
"loss": 5.5549,
|
|
"mean_token_accuracy": 0.1550630509853363,
|
|
"num_tokens": 19832936.0,
|
|
"step": 10745
|
|
},
|
|
{
|
|
"entropy": 5.814233350753784,
|
|
"epoch": 0.9031716026044948,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004924660539955463,
|
|
"loss": 5.6924,
|
|
"mean_token_accuracy": 0.1609632506966591,
|
|
"num_tokens": 19841946.0,
|
|
"step": 10750
|
|
},
|
|
{
|
|
"entropy": 5.760442447662354,
|
|
"epoch": 0.9035916824196597,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004924583675715063,
|
|
"loss": 5.5839,
|
|
"mean_token_accuracy": 0.15627699494361877,
|
|
"num_tokens": 19851469.0,
|
|
"step": 10755
|
|
},
|
|
{
|
|
"entropy": 5.779312658309936,
|
|
"epoch": 0.9040117622348246,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004924506772953031,
|
|
"loss": 5.6631,
|
|
"mean_token_accuracy": 0.15602973401546477,
|
|
"num_tokens": 19860731.0,
|
|
"step": 10760
|
|
},
|
|
{
|
|
"entropy": 5.787699460983276,
|
|
"epoch": 0.9044318420499895,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004924429831670733,
|
|
"loss": 5.6568,
|
|
"mean_token_accuracy": 0.15296549201011658,
|
|
"num_tokens": 19869717.0,
|
|
"step": 10765
|
|
},
|
|
{
|
|
"entropy": 5.779390621185303,
|
|
"epoch": 0.9048519218651544,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000492435285186953,
|
|
"loss": 5.6022,
|
|
"mean_token_accuracy": 0.16025329232215882,
|
|
"num_tokens": 19879229.0,
|
|
"step": 10770
|
|
},
|
|
{
|
|
"entropy": 5.836106300354004,
|
|
"epoch": 0.9052720016803193,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004924275833550785,
|
|
"loss": 5.6011,
|
|
"mean_token_accuracy": 0.15716341137886047,
|
|
"num_tokens": 19888260.0,
|
|
"step": 10775
|
|
},
|
|
{
|
|
"entropy": 5.820453453063965,
|
|
"epoch": 0.9056920814954842,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004924198776715865,
|
|
"loss": 5.6123,
|
|
"mean_token_accuracy": 0.16213833093643187,
|
|
"num_tokens": 19897070.0,
|
|
"step": 10780
|
|
},
|
|
{
|
|
"entropy": 5.75965895652771,
|
|
"epoch": 0.906112161310649,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004924121681366132,
|
|
"loss": 5.603,
|
|
"mean_token_accuracy": 0.15155648514628411,
|
|
"num_tokens": 19907170.0,
|
|
"step": 10785
|
|
},
|
|
{
|
|
"entropy": 5.723458623886108,
|
|
"epoch": 0.9065322411258139,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004924044547502951,
|
|
"loss": 5.5437,
|
|
"mean_token_accuracy": 0.15625743418931962,
|
|
"num_tokens": 19917220.0,
|
|
"step": 10790
|
|
},
|
|
{
|
|
"entropy": 5.658956575393677,
|
|
"epoch": 0.9069523209409788,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004923967375127692,
|
|
"loss": 5.6019,
|
|
"mean_token_accuracy": 0.1578077644109726,
|
|
"num_tokens": 19926724.0,
|
|
"step": 10795
|
|
},
|
|
{
|
|
"entropy": 5.811443948745728,
|
|
"epoch": 0.9073724007561437,
|
|
"grad_norm": 2.5,
|
|
"learning_rate": 0.000492389016424172,
|
|
"loss": 5.7088,
|
|
"mean_token_accuracy": 0.1508877694606781,
|
|
"num_tokens": 19936429.0,
|
|
"step": 10800
|
|
},
|
|
{
|
|
"entropy": 5.7526062488555905,
|
|
"epoch": 0.9077924805713086,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004923812914846404,
|
|
"loss": 5.4788,
|
|
"mean_token_accuracy": 0.1610741063952446,
|
|
"num_tokens": 19945096.0,
|
|
"step": 10805
|
|
},
|
|
{
|
|
"entropy": 5.742060136795044,
|
|
"epoch": 0.9082125603864735,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004923735626943111,
|
|
"loss": 5.5539,
|
|
"mean_token_accuracy": 0.16053722351789473,
|
|
"num_tokens": 19953560.0,
|
|
"step": 10810
|
|
},
|
|
{
|
|
"entropy": 5.70956597328186,
|
|
"epoch": 0.9086326402016384,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004923658300533211,
|
|
"loss": 5.5411,
|
|
"mean_token_accuracy": 0.15640968233346939,
|
|
"num_tokens": 19962669.0,
|
|
"step": 10815
|
|
},
|
|
{
|
|
"entropy": 5.794610786437988,
|
|
"epoch": 0.9090527200168032,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004923580935618073,
|
|
"loss": 5.5757,
|
|
"mean_token_accuracy": 0.1538527265191078,
|
|
"num_tokens": 19971990.0,
|
|
"step": 10820
|
|
},
|
|
{
|
|
"entropy": 5.746137094497681,
|
|
"epoch": 0.909472799831968,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004923503532199069,
|
|
"loss": 5.5822,
|
|
"mean_token_accuracy": 0.15799881666898727,
|
|
"num_tokens": 19981850.0,
|
|
"step": 10825
|
|
},
|
|
{
|
|
"entropy": 5.759499168395996,
|
|
"epoch": 0.909892879647133,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004923426090277567,
|
|
"loss": 5.6109,
|
|
"mean_token_accuracy": 0.15298285335302353,
|
|
"num_tokens": 19991574.0,
|
|
"step": 10830
|
|
},
|
|
{
|
|
"entropy": 5.754549360275268,
|
|
"epoch": 0.9103129594622978,
|
|
"grad_norm": 2.78125,
|
|
"learning_rate": 0.0004923348609854943,
|
|
"loss": 5.5906,
|
|
"mean_token_accuracy": 0.15852705538272857,
|
|
"num_tokens": 20001392.0,
|
|
"step": 10835
|
|
},
|
|
{
|
|
"entropy": 5.73292727470398,
|
|
"epoch": 0.9107330392774627,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004923271090932566,
|
|
"loss": 5.6202,
|
|
"mean_token_accuracy": 0.1504620835185051,
|
|
"num_tokens": 20011277.0,
|
|
"step": 10840
|
|
},
|
|
{
|
|
"entropy": 5.673085355758667,
|
|
"epoch": 0.9111531190926276,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004923193533511812,
|
|
"loss": 5.536,
|
|
"mean_token_accuracy": 0.15325282141566277,
|
|
"num_tokens": 20021171.0,
|
|
"step": 10845
|
|
},
|
|
{
|
|
"entropy": 5.827489376068115,
|
|
"epoch": 0.9115731989077924,
|
|
"grad_norm": 2.421875,
|
|
"learning_rate": 0.0004923115937594053,
|
|
"loss": 5.61,
|
|
"mean_token_accuracy": 0.15670632869005202,
|
|
"num_tokens": 20030189.0,
|
|
"step": 10850
|
|
},
|
|
{
|
|
"entropy": 5.8075799465179445,
|
|
"epoch": 0.9119932787229573,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004923038303180664,
|
|
"loss": 5.5889,
|
|
"mean_token_accuracy": 0.1611725553870201,
|
|
"num_tokens": 20038287.0,
|
|
"step": 10855
|
|
},
|
|
{
|
|
"entropy": 5.653801584243775,
|
|
"epoch": 0.9124133585381222,
|
|
"grad_norm": 3.515625,
|
|
"learning_rate": 0.000492296063027302,
|
|
"loss": 5.5977,
|
|
"mean_token_accuracy": 0.15539312362670898,
|
|
"num_tokens": 20047653.0,
|
|
"step": 10860
|
|
},
|
|
{
|
|
"entropy": 5.692626237869263,
|
|
"epoch": 0.9128334383532871,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004922882918872498,
|
|
"loss": 5.5947,
|
|
"mean_token_accuracy": 0.1545352503657341,
|
|
"num_tokens": 20057415.0,
|
|
"step": 10865
|
|
},
|
|
{
|
|
"entropy": 5.82832670211792,
|
|
"epoch": 0.913253518168452,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004922805168980475,
|
|
"loss": 5.5944,
|
|
"mean_token_accuracy": 0.15924015045166015,
|
|
"num_tokens": 20065996.0,
|
|
"step": 10870
|
|
},
|
|
{
|
|
"entropy": 5.750165605545044,
|
|
"epoch": 0.9136735979836169,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004922727380598326,
|
|
"loss": 5.5405,
|
|
"mean_token_accuracy": 0.15805808305740357,
|
|
"num_tokens": 20075376.0,
|
|
"step": 10875
|
|
},
|
|
{
|
|
"entropy": 5.684902000427246,
|
|
"epoch": 0.9140936777987818,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.000492264955372743,
|
|
"loss": 5.5865,
|
|
"mean_token_accuracy": 0.14841223880648613,
|
|
"num_tokens": 20084950.0,
|
|
"step": 10880
|
|
},
|
|
{
|
|
"entropy": 5.780635118484497,
|
|
"epoch": 0.9145137576139466,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004922571688369165,
|
|
"loss": 5.5569,
|
|
"mean_token_accuracy": 0.16063966900110244,
|
|
"num_tokens": 20094011.0,
|
|
"step": 10885
|
|
},
|
|
{
|
|
"entropy": 5.734232330322266,
|
|
"epoch": 0.9149338374291115,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004922493784524914,
|
|
"loss": 5.5363,
|
|
"mean_token_accuracy": 0.163571584969759,
|
|
"num_tokens": 20103037.0,
|
|
"step": 10890
|
|
},
|
|
{
|
|
"entropy": 5.757091379165649,
|
|
"epoch": 0.9153539172442764,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004922415842196052,
|
|
"loss": 5.6651,
|
|
"mean_token_accuracy": 0.14362147375941275,
|
|
"num_tokens": 20112727.0,
|
|
"step": 10895
|
|
},
|
|
{
|
|
"entropy": 5.6311070919036865,
|
|
"epoch": 0.9157739970594413,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004922337861383963,
|
|
"loss": 5.496,
|
|
"mean_token_accuracy": 0.16367468982934952,
|
|
"num_tokens": 20122341.0,
|
|
"step": 10900
|
|
},
|
|
{
|
|
"entropy": 5.730766153335571,
|
|
"epoch": 0.9161940768746062,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004922259842090027,
|
|
"loss": 5.4708,
|
|
"mean_token_accuracy": 0.16051837801933289,
|
|
"num_tokens": 20131354.0,
|
|
"step": 10905
|
|
},
|
|
{
|
|
"entropy": 5.721527194976806,
|
|
"epoch": 0.9166141566897711,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004922181784315627,
|
|
"loss": 5.5248,
|
|
"mean_token_accuracy": 0.16043773144483567,
|
|
"num_tokens": 20140440.0,
|
|
"step": 10910
|
|
},
|
|
{
|
|
"entropy": 5.667795944213867,
|
|
"epoch": 0.917034236504936,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.0004922103688062145,
|
|
"loss": 5.5163,
|
|
"mean_token_accuracy": 0.16204050332307815,
|
|
"num_tokens": 20149331.0,
|
|
"step": 10915
|
|
},
|
|
{
|
|
"entropy": 5.7060816287994385,
|
|
"epoch": 0.9174543163201008,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004922025553330964,
|
|
"loss": 5.4956,
|
|
"mean_token_accuracy": 0.16326858401298522,
|
|
"num_tokens": 20158566.0,
|
|
"step": 10920
|
|
},
|
|
{
|
|
"entropy": 5.75770206451416,
|
|
"epoch": 0.9178743961352657,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.000492194738012347,
|
|
"loss": 5.5981,
|
|
"mean_token_accuracy": 0.15985073149204254,
|
|
"num_tokens": 20168339.0,
|
|
"step": 10925
|
|
},
|
|
{
|
|
"entropy": 5.830264472961426,
|
|
"epoch": 0.9182944759504306,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004921869168441045,
|
|
"loss": 5.6149,
|
|
"mean_token_accuracy": 0.1499125950038433,
|
|
"num_tokens": 20177967.0,
|
|
"step": 10930
|
|
},
|
|
{
|
|
"entropy": 5.692331171035766,
|
|
"epoch": 0.9187145557655955,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004921790918285077,
|
|
"loss": 5.6048,
|
|
"mean_token_accuracy": 0.1557812660932541,
|
|
"num_tokens": 20187279.0,
|
|
"step": 10935
|
|
},
|
|
{
|
|
"entropy": 5.700629711151123,
|
|
"epoch": 0.9191346355807604,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004921712629656951,
|
|
"loss": 5.7076,
|
|
"mean_token_accuracy": 0.16659070774912835,
|
|
"num_tokens": 20195324.0,
|
|
"step": 10940
|
|
},
|
|
{
|
|
"entropy": 5.841782522201538,
|
|
"epoch": 0.9195547153959253,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004921634302558054,
|
|
"loss": 5.6266,
|
|
"mean_token_accuracy": 0.15811780244112014,
|
|
"num_tokens": 20204985.0,
|
|
"step": 10945
|
|
},
|
|
{
|
|
"entropy": 5.764072608947754,
|
|
"epoch": 0.9199747952110902,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004921555936989773,
|
|
"loss": 5.6432,
|
|
"mean_token_accuracy": 0.15192149430513383,
|
|
"num_tokens": 20214553.0,
|
|
"step": 10950
|
|
},
|
|
{
|
|
"entropy": 5.758094644546508,
|
|
"epoch": 0.9203948750262549,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004921477532953497,
|
|
"loss": 5.5565,
|
|
"mean_token_accuracy": 0.15732019394636154,
|
|
"num_tokens": 20224118.0,
|
|
"step": 10955
|
|
},
|
|
{
|
|
"entropy": 5.722699308395386,
|
|
"epoch": 0.9208149548414198,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004921399090450616,
|
|
"loss": 5.4984,
|
|
"mean_token_accuracy": 0.15792709290981294,
|
|
"num_tokens": 20233719.0,
|
|
"step": 10960
|
|
},
|
|
{
|
|
"entropy": 5.747791290283203,
|
|
"epoch": 0.9212350346565847,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004921320609482517,
|
|
"loss": 5.6043,
|
|
"mean_token_accuracy": 0.1583334594964981,
|
|
"num_tokens": 20242311.0,
|
|
"step": 10965
|
|
},
|
|
{
|
|
"entropy": 5.80935845375061,
|
|
"epoch": 0.9216551144717496,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004921242090050591,
|
|
"loss": 5.6304,
|
|
"mean_token_accuracy": 0.15593896806240082,
|
|
"num_tokens": 20252998.0,
|
|
"step": 10970
|
|
},
|
|
{
|
|
"entropy": 5.825342416763306,
|
|
"epoch": 0.9220751942869145,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.000492116353215623,
|
|
"loss": 5.673,
|
|
"mean_token_accuracy": 0.15853351205587388,
|
|
"num_tokens": 20262456.0,
|
|
"step": 10975
|
|
},
|
|
{
|
|
"entropy": 5.665419816970825,
|
|
"epoch": 0.9224952741020794,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004921084935800825,
|
|
"loss": 5.4387,
|
|
"mean_token_accuracy": 0.16515873968601227,
|
|
"num_tokens": 20271516.0,
|
|
"step": 10980
|
|
},
|
|
{
|
|
"entropy": 5.676522207260132,
|
|
"epoch": 0.9229153539172443,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004921006300985768,
|
|
"loss": 5.4974,
|
|
"mean_token_accuracy": 0.16081881821155547,
|
|
"num_tokens": 20280373.0,
|
|
"step": 10985
|
|
},
|
|
{
|
|
"entropy": 5.7067461013793945,
|
|
"epoch": 0.9233354337324091,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004920927627712453,
|
|
"loss": 5.4998,
|
|
"mean_token_accuracy": 0.16111356168985366,
|
|
"num_tokens": 20289426.0,
|
|
"step": 10990
|
|
},
|
|
{
|
|
"entropy": 5.799027299880981,
|
|
"epoch": 0.923755513547574,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004920848915982273,
|
|
"loss": 5.6429,
|
|
"mean_token_accuracy": 0.15812461227178573,
|
|
"num_tokens": 20298045.0,
|
|
"step": 10995
|
|
},
|
|
{
|
|
"entropy": 5.667883491516113,
|
|
"epoch": 0.9241755933627389,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004920770165796622,
|
|
"loss": 5.5306,
|
|
"mean_token_accuracy": 0.15947688817977906,
|
|
"num_tokens": 20307352.0,
|
|
"step": 11000
|
|
},
|
|
{
|
|
"entropy": 5.72949481010437,
|
|
"epoch": 0.9245956731779038,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004920691377156895,
|
|
"loss": 5.5577,
|
|
"mean_token_accuracy": 0.15705136358737945,
|
|
"num_tokens": 20316448.0,
|
|
"step": 11005
|
|
},
|
|
{
|
|
"entropy": 5.833560037612915,
|
|
"epoch": 0.9250157529930687,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.0004920612550064488,
|
|
"loss": 5.712,
|
|
"mean_token_accuracy": 0.15252626091241836,
|
|
"num_tokens": 20326440.0,
|
|
"step": 11010
|
|
},
|
|
{
|
|
"entropy": 5.694369506835938,
|
|
"epoch": 0.9254358328082336,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004920533684520797,
|
|
"loss": 5.4748,
|
|
"mean_token_accuracy": 0.16088208854198455,
|
|
"num_tokens": 20335447.0,
|
|
"step": 11015
|
|
},
|
|
{
|
|
"entropy": 5.68902063369751,
|
|
"epoch": 0.9258559126233984,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000492045478052722,
|
|
"loss": 5.6386,
|
|
"mean_token_accuracy": 0.15640645027160643,
|
|
"num_tokens": 20344523.0,
|
|
"step": 11020
|
|
},
|
|
{
|
|
"entropy": 5.749677991867065,
|
|
"epoch": 0.9262759924385633,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004920375838085154,
|
|
"loss": 5.5937,
|
|
"mean_token_accuracy": 0.15958297103643418,
|
|
"num_tokens": 20354267.0,
|
|
"step": 11025
|
|
},
|
|
{
|
|
"entropy": 5.802869653701782,
|
|
"epoch": 0.9266960722537282,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004920296857195998,
|
|
"loss": 5.6479,
|
|
"mean_token_accuracy": 0.1565267562866211,
|
|
"num_tokens": 20364137.0,
|
|
"step": 11030
|
|
},
|
|
{
|
|
"entropy": 5.738079071044922,
|
|
"epoch": 0.9271161520688931,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.000492021783786115,
|
|
"loss": 5.5488,
|
|
"mean_token_accuracy": 0.16330525726079942,
|
|
"num_tokens": 20372583.0,
|
|
"step": 11035
|
|
},
|
|
{
|
|
"entropy": 5.696683788299561,
|
|
"epoch": 0.927536231884058,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004920138780082011,
|
|
"loss": 5.4984,
|
|
"mean_token_accuracy": 0.1606284871697426,
|
|
"num_tokens": 20382050.0,
|
|
"step": 11040
|
|
},
|
|
{
|
|
"entropy": 5.689135313034058,
|
|
"epoch": 0.9279563116992229,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004920059683859981,
|
|
"loss": 5.4654,
|
|
"mean_token_accuracy": 0.16342781931161882,
|
|
"num_tokens": 20391425.0,
|
|
"step": 11045
|
|
},
|
|
{
|
|
"entropy": 5.7500804424285885,
|
|
"epoch": 0.9283763915143878,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004919980549196461,
|
|
"loss": 5.6327,
|
|
"mean_token_accuracy": 0.1570240467786789,
|
|
"num_tokens": 20400559.0,
|
|
"step": 11050
|
|
},
|
|
{
|
|
"entropy": 5.770569467544556,
|
|
"epoch": 0.9287964713295526,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004919901376092853,
|
|
"loss": 5.5516,
|
|
"mean_token_accuracy": 0.15826557129621505,
|
|
"num_tokens": 20408985.0,
|
|
"step": 11055
|
|
},
|
|
{
|
|
"entropy": 5.722348403930664,
|
|
"epoch": 0.9292165511447175,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004919822164550559,
|
|
"loss": 5.6533,
|
|
"mean_token_accuracy": 0.14333268254995346,
|
|
"num_tokens": 20417855.0,
|
|
"step": 11060
|
|
},
|
|
{
|
|
"entropy": 5.745558738708496,
|
|
"epoch": 0.9296366309598824,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004919742914570983,
|
|
"loss": 5.61,
|
|
"mean_token_accuracy": 0.1635189712047577,
|
|
"num_tokens": 20426191.0,
|
|
"step": 11065
|
|
},
|
|
{
|
|
"entropy": 5.728816938400269,
|
|
"epoch": 0.9300567107750473,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.000491966362615553,
|
|
"loss": 5.5954,
|
|
"mean_token_accuracy": 0.14999182894825935,
|
|
"num_tokens": 20435592.0,
|
|
"step": 11070
|
|
},
|
|
{
|
|
"entropy": 5.795909309387207,
|
|
"epoch": 0.9304767905902122,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00049195842993056,
|
|
"loss": 5.6229,
|
|
"mean_token_accuracy": 0.15422592610120772,
|
|
"num_tokens": 20445504.0,
|
|
"step": 11075
|
|
},
|
|
{
|
|
"entropy": 5.764322519302368,
|
|
"epoch": 0.930896870405377,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004919504934022604,
|
|
"loss": 5.542,
|
|
"mean_token_accuracy": 0.15572449266910554,
|
|
"num_tokens": 20455153.0,
|
|
"step": 11080
|
|
},
|
|
{
|
|
"entropy": 5.667031097412109,
|
|
"epoch": 0.931316950220542,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004919425530307943,
|
|
"loss": 5.5469,
|
|
"mean_token_accuracy": 0.1548906832933426,
|
|
"num_tokens": 20465101.0,
|
|
"step": 11085
|
|
},
|
|
{
|
|
"entropy": 5.717367219924927,
|
|
"epoch": 0.9317370300357067,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004919346088163028,
|
|
"loss": 5.6009,
|
|
"mean_token_accuracy": 0.15931420922279357,
|
|
"num_tokens": 20474700.0,
|
|
"step": 11090
|
|
},
|
|
{
|
|
"entropy": 5.774007034301758,
|
|
"epoch": 0.9321571098508716,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004919266607589263,
|
|
"loss": 5.6202,
|
|
"mean_token_accuracy": 0.1513561874628067,
|
|
"num_tokens": 20483945.0,
|
|
"step": 11095
|
|
},
|
|
{
|
|
"entropy": 5.737395286560059,
|
|
"epoch": 0.9325771896660365,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004919187088588057,
|
|
"loss": 5.598,
|
|
"mean_token_accuracy": 0.15983821004629134,
|
|
"num_tokens": 20493307.0,
|
|
"step": 11100
|
|
},
|
|
{
|
|
"entropy": 5.709204387664795,
|
|
"epoch": 0.9329972694812014,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004919107531160819,
|
|
"loss": 5.511,
|
|
"mean_token_accuracy": 0.16708060055971147,
|
|
"num_tokens": 20501889.0,
|
|
"step": 11105
|
|
},
|
|
{
|
|
"entropy": 5.6891406059265135,
|
|
"epoch": 0.9334173492963663,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004919027935308957,
|
|
"loss": 5.5612,
|
|
"mean_token_accuracy": 0.16252549216151238,
|
|
"num_tokens": 20510577.0,
|
|
"step": 11110
|
|
},
|
|
{
|
|
"entropy": 5.646807098388672,
|
|
"epoch": 0.9338374291115312,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004918948301033884,
|
|
"loss": 5.5304,
|
|
"mean_token_accuracy": 0.1584120899438858,
|
|
"num_tokens": 20520025.0,
|
|
"step": 11115
|
|
},
|
|
{
|
|
"entropy": 5.815218305587768,
|
|
"epoch": 0.9342575089266961,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004918868628337007,
|
|
"loss": 5.5602,
|
|
"mean_token_accuracy": 0.16129291653633118,
|
|
"num_tokens": 20528989.0,
|
|
"step": 11120
|
|
},
|
|
{
|
|
"entropy": 5.745483779907227,
|
|
"epoch": 0.9346775887418609,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004918788917219739,
|
|
"loss": 5.5327,
|
|
"mean_token_accuracy": 0.15650796443223952,
|
|
"num_tokens": 20538328.0,
|
|
"step": 11125
|
|
},
|
|
{
|
|
"entropy": 5.694914293289185,
|
|
"epoch": 0.9350976685570258,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004918709167683493,
|
|
"loss": 5.6755,
|
|
"mean_token_accuracy": 0.14997887313365937,
|
|
"num_tokens": 20548069.0,
|
|
"step": 11130
|
|
},
|
|
{
|
|
"entropy": 5.676989889144897,
|
|
"epoch": 0.9355177483721907,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004918629379729681,
|
|
"loss": 5.4148,
|
|
"mean_token_accuracy": 0.16828625798225402,
|
|
"num_tokens": 20557128.0,
|
|
"step": 11135
|
|
},
|
|
{
|
|
"entropy": 5.675128316879272,
|
|
"epoch": 0.9359378281873556,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004918549553359715,
|
|
"loss": 5.5333,
|
|
"mean_token_accuracy": 0.15687671005725862,
|
|
"num_tokens": 20566352.0,
|
|
"step": 11140
|
|
},
|
|
{
|
|
"entropy": 5.729965591430664,
|
|
"epoch": 0.9363579080025205,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004918469688575012,
|
|
"loss": 5.5638,
|
|
"mean_token_accuracy": 0.15703129321336745,
|
|
"num_tokens": 20575814.0,
|
|
"step": 11145
|
|
},
|
|
{
|
|
"entropy": 5.715324926376343,
|
|
"epoch": 0.9367779878176854,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004918389785376983,
|
|
"loss": 5.4586,
|
|
"mean_token_accuracy": 0.15755079239606856,
|
|
"num_tokens": 20584715.0,
|
|
"step": 11150
|
|
},
|
|
{
|
|
"entropy": 5.646192789077759,
|
|
"epoch": 0.9371980676328503,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004918309843767047,
|
|
"loss": 5.5208,
|
|
"mean_token_accuracy": 0.15936681032180786,
|
|
"num_tokens": 20594630.0,
|
|
"step": 11155
|
|
},
|
|
{
|
|
"entropy": 5.648511266708374,
|
|
"epoch": 0.9376181474480151,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004918229863746618,
|
|
"loss": 5.5117,
|
|
"mean_token_accuracy": 0.15583572238683702,
|
|
"num_tokens": 20603653.0,
|
|
"step": 11160
|
|
},
|
|
{
|
|
"entropy": 5.783355236053467,
|
|
"epoch": 0.93803822726318,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004918149845317114,
|
|
"loss": 5.5747,
|
|
"mean_token_accuracy": 0.15749593526124955,
|
|
"num_tokens": 20612188.0,
|
|
"step": 11165
|
|
},
|
|
{
|
|
"entropy": 5.7411479473114015,
|
|
"epoch": 0.9384583070783449,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004918069788479952,
|
|
"loss": 5.5024,
|
|
"mean_token_accuracy": 0.16424321234226227,
|
|
"num_tokens": 20620933.0,
|
|
"step": 11170
|
|
},
|
|
{
|
|
"entropy": 5.719698190689087,
|
|
"epoch": 0.9388783868935098,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004917989693236549,
|
|
"loss": 5.5437,
|
|
"mean_token_accuracy": 0.16058558821678162,
|
|
"num_tokens": 20629919.0,
|
|
"step": 11175
|
|
},
|
|
{
|
|
"entropy": 5.704256439208985,
|
|
"epoch": 0.9392984667086747,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004917909559588326,
|
|
"loss": 5.5195,
|
|
"mean_token_accuracy": 0.15566968470811843,
|
|
"num_tokens": 20638475.0,
|
|
"step": 11180
|
|
},
|
|
{
|
|
"entropy": 5.814646291732788,
|
|
"epoch": 0.9397185465238396,
|
|
"grad_norm": 2.71875,
|
|
"learning_rate": 0.00049178293875367,
|
|
"loss": 5.6583,
|
|
"mean_token_accuracy": 0.14793116524815558,
|
|
"num_tokens": 20648105.0,
|
|
"step": 11185
|
|
},
|
|
{
|
|
"entropy": 5.706754207611084,
|
|
"epoch": 0.9401386263390044,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004917749177083094,
|
|
"loss": 5.5435,
|
|
"mean_token_accuracy": 0.1529379442334175,
|
|
"num_tokens": 20657527.0,
|
|
"step": 11190
|
|
},
|
|
{
|
|
"entropy": 5.738810300827026,
|
|
"epoch": 0.9405587061541693,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004917668928228927,
|
|
"loss": 5.5578,
|
|
"mean_token_accuracy": 0.1584453448653221,
|
|
"num_tokens": 20666375.0,
|
|
"step": 11195
|
|
},
|
|
{
|
|
"entropy": 5.731826686859131,
|
|
"epoch": 0.9409787859693342,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004917588640975622,
|
|
"loss": 5.4996,
|
|
"mean_token_accuracy": 0.1631929226219654,
|
|
"num_tokens": 20675350.0,
|
|
"step": 11200
|
|
},
|
|
{
|
|
"entropy": 5.603848695755005,
|
|
"epoch": 0.941398865784499,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00049175083153246,
|
|
"loss": 5.4319,
|
|
"mean_token_accuracy": 0.15711961686611176,
|
|
"num_tokens": 20684072.0,
|
|
"step": 11205
|
|
},
|
|
{
|
|
"entropy": 5.622369909286499,
|
|
"epoch": 0.941818945599664,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004917427951277284,
|
|
"loss": 5.5076,
|
|
"mean_token_accuracy": 0.16261884421110154,
|
|
"num_tokens": 20692989.0,
|
|
"step": 11210
|
|
},
|
|
{
|
|
"entropy": 5.732786989212036,
|
|
"epoch": 0.9422390254148288,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004917347548835097,
|
|
"loss": 5.4937,
|
|
"mean_token_accuracy": 0.15980781465768815,
|
|
"num_tokens": 20701269.0,
|
|
"step": 11215
|
|
},
|
|
{
|
|
"entropy": 5.7467756271362305,
|
|
"epoch": 0.9426591052299937,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004917267107999466,
|
|
"loss": 5.5982,
|
|
"mean_token_accuracy": 0.153051495552063,
|
|
"num_tokens": 20709739.0,
|
|
"step": 11220
|
|
},
|
|
{
|
|
"entropy": 5.66306734085083,
|
|
"epoch": 0.9430791850451585,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004917186628771812,
|
|
"loss": 5.536,
|
|
"mean_token_accuracy": 0.16327179223299026,
|
|
"num_tokens": 20718950.0,
|
|
"step": 11225
|
|
},
|
|
{
|
|
"entropy": 5.68591194152832,
|
|
"epoch": 0.9434992648603234,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004917106111153565,
|
|
"loss": 5.5229,
|
|
"mean_token_accuracy": 0.1601232573390007,
|
|
"num_tokens": 20729469.0,
|
|
"step": 11230
|
|
},
|
|
{
|
|
"entropy": 5.742890501022339,
|
|
"epoch": 0.9439193446754883,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004917025555146148,
|
|
"loss": 5.5378,
|
|
"mean_token_accuracy": 0.16962186694145204,
|
|
"num_tokens": 20738231.0,
|
|
"step": 11235
|
|
},
|
|
{
|
|
"entropy": 5.788369464874267,
|
|
"epoch": 0.9443394244906532,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.000491694496075099,
|
|
"loss": 5.73,
|
|
"mean_token_accuracy": 0.1434485659003258,
|
|
"num_tokens": 20748578.0,
|
|
"step": 11240
|
|
},
|
|
{
|
|
"entropy": 5.8387247085571286,
|
|
"epoch": 0.9447595043058181,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004916864327969517,
|
|
"loss": 5.6702,
|
|
"mean_token_accuracy": 0.14663874506950378,
|
|
"num_tokens": 20759284.0,
|
|
"step": 11245
|
|
},
|
|
{
|
|
"entropy": 5.803573560714722,
|
|
"epoch": 0.945179584120983,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004916783656803158,
|
|
"loss": 5.6005,
|
|
"mean_token_accuracy": 0.16080168783664703,
|
|
"num_tokens": 20768186.0,
|
|
"step": 11250
|
|
},
|
|
{
|
|
"entropy": 5.677453899383545,
|
|
"epoch": 0.9455996639361479,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004916702947253342,
|
|
"loss": 5.4665,
|
|
"mean_token_accuracy": 0.16224973052740096,
|
|
"num_tokens": 20776711.0,
|
|
"step": 11255
|
|
},
|
|
{
|
|
"entropy": 5.703570175170898,
|
|
"epoch": 0.9460197437513127,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0004916622199321501,
|
|
"loss": 5.5457,
|
|
"mean_token_accuracy": 0.1602822169661522,
|
|
"num_tokens": 20785154.0,
|
|
"step": 11260
|
|
},
|
|
{
|
|
"entropy": 5.761015748977661,
|
|
"epoch": 0.9464398235664776,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004916541413009062,
|
|
"loss": 5.4899,
|
|
"mean_token_accuracy": 0.16483698636293412,
|
|
"num_tokens": 20794114.0,
|
|
"step": 11265
|
|
},
|
|
{
|
|
"entropy": 5.797151041030884,
|
|
"epoch": 0.9468599033816425,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004916460588317458,
|
|
"loss": 5.5829,
|
|
"mean_token_accuracy": 0.15465315878391267,
|
|
"num_tokens": 20803892.0,
|
|
"step": 11270
|
|
},
|
|
{
|
|
"entropy": 5.611593103408813,
|
|
"epoch": 0.9472799831968074,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004916379725248118,
|
|
"loss": 5.4918,
|
|
"mean_token_accuracy": 0.16398582309484483,
|
|
"num_tokens": 20812892.0,
|
|
"step": 11275
|
|
},
|
|
{
|
|
"entropy": 5.729201555252075,
|
|
"epoch": 0.9477000630119723,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004916298823802479,
|
|
"loss": 5.548,
|
|
"mean_token_accuracy": 0.1526999518275261,
|
|
"num_tokens": 20821934.0,
|
|
"step": 11280
|
|
},
|
|
{
|
|
"entropy": 5.688755464553833,
|
|
"epoch": 0.9481201428271372,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004916217883981971,
|
|
"loss": 5.4658,
|
|
"mean_token_accuracy": 0.16353494822978973,
|
|
"num_tokens": 20830100.0,
|
|
"step": 11285
|
|
},
|
|
{
|
|
"entropy": 5.661793756484985,
|
|
"epoch": 0.9485402226423021,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004916136905788029,
|
|
"loss": 5.5466,
|
|
"mean_token_accuracy": 0.15961572378873826,
|
|
"num_tokens": 20839890.0,
|
|
"step": 11290
|
|
},
|
|
{
|
|
"entropy": 5.753994560241699,
|
|
"epoch": 0.9489603024574669,
|
|
"grad_norm": 2.515625,
|
|
"learning_rate": 0.0004916055889222087,
|
|
"loss": 5.6685,
|
|
"mean_token_accuracy": 0.14428120404481887,
|
|
"num_tokens": 20848670.0,
|
|
"step": 11295
|
|
},
|
|
{
|
|
"entropy": 5.780421400070191,
|
|
"epoch": 0.9493803822726318,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.000491597483428558,
|
|
"loss": 5.4958,
|
|
"mean_token_accuracy": 0.170906001329422,
|
|
"num_tokens": 20857291.0,
|
|
"step": 11300
|
|
},
|
|
{
|
|
"entropy": 5.632902812957764,
|
|
"epoch": 0.9498004620877967,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004915893740979944,
|
|
"loss": 5.4622,
|
|
"mean_token_accuracy": 0.1642448052763939,
|
|
"num_tokens": 20865341.0,
|
|
"step": 11305
|
|
},
|
|
{
|
|
"entropy": 5.743787240982056,
|
|
"epoch": 0.9502205419029616,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004915812609306617,
|
|
"loss": 5.6081,
|
|
"mean_token_accuracy": 0.158392533659935,
|
|
"num_tokens": 20875194.0,
|
|
"step": 11310
|
|
},
|
|
{
|
|
"entropy": 5.764333438873291,
|
|
"epoch": 0.9506406217181265,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004915731439267034,
|
|
"loss": 5.5164,
|
|
"mean_token_accuracy": 0.15837468653917314,
|
|
"num_tokens": 20884831.0,
|
|
"step": 11315
|
|
},
|
|
{
|
|
"entropy": 5.652561187744141,
|
|
"epoch": 0.9510607015332914,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004915650230862634,
|
|
"loss": 5.4019,
|
|
"mean_token_accuracy": 0.16960276961326598,
|
|
"num_tokens": 20893790.0,
|
|
"step": 11320
|
|
},
|
|
{
|
|
"entropy": 5.6419471263885494,
|
|
"epoch": 0.9514807813484563,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004915568984094854,
|
|
"loss": 5.5285,
|
|
"mean_token_accuracy": 0.16155760288238524,
|
|
"num_tokens": 20902175.0,
|
|
"step": 11325
|
|
},
|
|
{
|
|
"entropy": 5.830080270767212,
|
|
"epoch": 0.951900861163621,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004915487698965136,
|
|
"loss": 5.6617,
|
|
"mean_token_accuracy": 0.15098460614681244,
|
|
"num_tokens": 20911484.0,
|
|
"step": 11330
|
|
},
|
|
{
|
|
"entropy": 5.830371952056884,
|
|
"epoch": 0.952320940978786,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004915406375474917,
|
|
"loss": 5.6201,
|
|
"mean_token_accuracy": 0.15232392996549607,
|
|
"num_tokens": 20920916.0,
|
|
"step": 11335
|
|
},
|
|
{
|
|
"entropy": 5.751460123062134,
|
|
"epoch": 0.9527410207939508,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000491532501362564,
|
|
"loss": 5.6334,
|
|
"mean_token_accuracy": 0.159017938375473,
|
|
"num_tokens": 20930219.0,
|
|
"step": 11340
|
|
},
|
|
{
|
|
"entropy": 5.692816257476807,
|
|
"epoch": 0.9531611006091157,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004915243613418745,
|
|
"loss": 5.4478,
|
|
"mean_token_accuracy": 0.16899611204862594,
|
|
"num_tokens": 20939591.0,
|
|
"step": 11345
|
|
},
|
|
{
|
|
"entropy": 5.779568099975586,
|
|
"epoch": 0.9535811804242806,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.0004915162174855675,
|
|
"loss": 5.6211,
|
|
"mean_token_accuracy": 0.1536381497979164,
|
|
"num_tokens": 20950035.0,
|
|
"step": 11350
|
|
},
|
|
{
|
|
"entropy": 5.705796003341675,
|
|
"epoch": 0.9540012602394455,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004915080697937872,
|
|
"loss": 5.5265,
|
|
"mean_token_accuracy": 0.15976119190454482,
|
|
"num_tokens": 20959168.0,
|
|
"step": 11355
|
|
},
|
|
{
|
|
"entropy": 5.632742023468017,
|
|
"epoch": 0.9544213400546103,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004914999182666779,
|
|
"loss": 5.4551,
|
|
"mean_token_accuracy": 0.1686936303973198,
|
|
"num_tokens": 20967887.0,
|
|
"step": 11360
|
|
},
|
|
{
|
|
"entropy": 5.734130430221557,
|
|
"epoch": 0.9548414198697752,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004914917629043839,
|
|
"loss": 5.5737,
|
|
"mean_token_accuracy": 0.1523936167359352,
|
|
"num_tokens": 20977558.0,
|
|
"step": 11365
|
|
},
|
|
{
|
|
"entropy": 5.661308240890503,
|
|
"epoch": 0.9552614996849401,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00049148360370705,
|
|
"loss": 5.521,
|
|
"mean_token_accuracy": 0.16197700947523117,
|
|
"num_tokens": 20986118.0,
|
|
"step": 11370
|
|
},
|
|
{
|
|
"entropy": 5.722346544265747,
|
|
"epoch": 0.955681579500105,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004914754406748204,
|
|
"loss": 5.4573,
|
|
"mean_token_accuracy": 0.16239626556634904,
|
|
"num_tokens": 20994623.0,
|
|
"step": 11375
|
|
},
|
|
{
|
|
"entropy": 5.779690885543824,
|
|
"epoch": 0.9561016593152699,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00049146727380784,
|
|
"loss": 5.637,
|
|
"mean_token_accuracy": 0.1525883451104164,
|
|
"num_tokens": 21004193.0,
|
|
"step": 11380
|
|
},
|
|
{
|
|
"entropy": 5.699955463409424,
|
|
"epoch": 0.9565217391304348,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004914591031062531,
|
|
"loss": 5.4637,
|
|
"mean_token_accuracy": 0.16423846036195755,
|
|
"num_tokens": 21013125.0,
|
|
"step": 11385
|
|
},
|
|
{
|
|
"entropy": 5.598379230499267,
|
|
"epoch": 0.9569418189455997,
|
|
"grad_norm": 1.3671875,
|
|
"learning_rate": 0.0004914509285702048,
|
|
"loss": 5.3926,
|
|
"mean_token_accuracy": 0.17079196721315384,
|
|
"num_tokens": 21021402.0,
|
|
"step": 11390
|
|
},
|
|
{
|
|
"entropy": 5.648781394958496,
|
|
"epoch": 0.9573618987607645,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004914427501998397,
|
|
"loss": 5.4695,
|
|
"mean_token_accuracy": 0.16255177408456803,
|
|
"num_tokens": 21029639.0,
|
|
"step": 11395
|
|
},
|
|
{
|
|
"entropy": 5.712826299667358,
|
|
"epoch": 0.9577819785759294,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004914345679953027,
|
|
"loss": 5.5082,
|
|
"mean_token_accuracy": 0.1599772408604622,
|
|
"num_tokens": 21037525.0,
|
|
"step": 11400
|
|
},
|
|
{
|
|
"entropy": 5.778018856048584,
|
|
"epoch": 0.9582020583910943,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004914263819567388,
|
|
"loss": 5.6048,
|
|
"mean_token_accuracy": 0.14846348613500596,
|
|
"num_tokens": 21047702.0,
|
|
"step": 11405
|
|
},
|
|
{
|
|
"entropy": 5.744657373428344,
|
|
"epoch": 0.9586221382062592,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.000491418192084293,
|
|
"loss": 5.5155,
|
|
"mean_token_accuracy": 0.16157382726669312,
|
|
"num_tokens": 21056379.0,
|
|
"step": 11410
|
|
},
|
|
{
|
|
"entropy": 5.676009654998779,
|
|
"epoch": 0.9590422180214241,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004914099983781104,
|
|
"loss": 5.5156,
|
|
"mean_token_accuracy": 0.16159558892250062,
|
|
"num_tokens": 21065283.0,
|
|
"step": 11415
|
|
},
|
|
{
|
|
"entropy": 5.707050800323486,
|
|
"epoch": 0.959462297836589,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000491401800838336,
|
|
"loss": 5.6565,
|
|
"mean_token_accuracy": 0.15205793231725692,
|
|
"num_tokens": 21074938.0,
|
|
"step": 11420
|
|
},
|
|
{
|
|
"entropy": 5.712154531478882,
|
|
"epoch": 0.9598823776517539,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004913935994651153,
|
|
"loss": 5.4786,
|
|
"mean_token_accuracy": 0.16634972542524337,
|
|
"num_tokens": 21084729.0,
|
|
"step": 11425
|
|
},
|
|
{
|
|
"entropy": 5.6186995029449465,
|
|
"epoch": 0.9603024574669187,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004913853942585932,
|
|
"loss": 5.3877,
|
|
"mean_token_accuracy": 0.16797681003808976,
|
|
"num_tokens": 21093456.0,
|
|
"step": 11430
|
|
},
|
|
{
|
|
"entropy": 5.6789297580719,
|
|
"epoch": 0.9607225372820836,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004913771852189155,
|
|
"loss": 5.5291,
|
|
"mean_token_accuracy": 0.16209679543972016,
|
|
"num_tokens": 21102980.0,
|
|
"step": 11435
|
|
},
|
|
{
|
|
"entropy": 5.837604236602783,
|
|
"epoch": 0.9611426170972485,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004913689723462271,
|
|
"loss": 5.7492,
|
|
"mean_token_accuracy": 0.16761390566825868,
|
|
"num_tokens": 21112777.0,
|
|
"step": 11440
|
|
},
|
|
{
|
|
"entropy": 5.760186672210693,
|
|
"epoch": 0.9615626969124134,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.000491360755640674,
|
|
"loss": 5.634,
|
|
"mean_token_accuracy": 0.1518979102373123,
|
|
"num_tokens": 21122139.0,
|
|
"step": 11445
|
|
},
|
|
{
|
|
"entropy": 5.685001182556152,
|
|
"epoch": 0.9619827767275783,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004913525351024014,
|
|
"loss": 5.5012,
|
|
"mean_token_accuracy": 0.15827474147081375,
|
|
"num_tokens": 21131425.0,
|
|
"step": 11450
|
|
},
|
|
{
|
|
"entropy": 5.645795059204102,
|
|
"epoch": 0.9624028565427432,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004913443107315552,
|
|
"loss": 5.4816,
|
|
"mean_token_accuracy": 0.15899788290262223,
|
|
"num_tokens": 21140784.0,
|
|
"step": 11455
|
|
},
|
|
{
|
|
"entropy": 5.737687778472901,
|
|
"epoch": 0.962822936357908,
|
|
"grad_norm": 3.328125,
|
|
"learning_rate": 0.0004913360825282807,
|
|
"loss": 5.4913,
|
|
"mean_token_accuracy": 0.16493094712495804,
|
|
"num_tokens": 21150408.0,
|
|
"step": 11460
|
|
},
|
|
{
|
|
"entropy": 5.70665397644043,
|
|
"epoch": 0.9632430161730728,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.000491327850492724,
|
|
"loss": 5.5903,
|
|
"mean_token_accuracy": 0.16218522936105728,
|
|
"num_tokens": 21158915.0,
|
|
"step": 11465
|
|
},
|
|
{
|
|
"entropy": 5.5823643684387205,
|
|
"epoch": 0.9636630959882377,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004913196146250309,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.16955134347081185,
|
|
"num_tokens": 21167336.0,
|
|
"step": 11470
|
|
},
|
|
{
|
|
"entropy": 5.790083742141723,
|
|
"epoch": 0.9640831758034026,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004913113749253472,
|
|
"loss": 5.7179,
|
|
"mean_token_accuracy": 0.15193437561392784,
|
|
"num_tokens": 21177499.0,
|
|
"step": 11475
|
|
},
|
|
{
|
|
"entropy": 5.817345762252808,
|
|
"epoch": 0.9645032556185675,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004913031313938188,
|
|
"loss": 5.6051,
|
|
"mean_token_accuracy": 0.15267798900604249,
|
|
"num_tokens": 21186961.0,
|
|
"step": 11480
|
|
},
|
|
{
|
|
"entropy": 5.753744745254517,
|
|
"epoch": 0.9649233354337324,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004912948840305919,
|
|
"loss": 5.5022,
|
|
"mean_token_accuracy": 0.1661657139658928,
|
|
"num_tokens": 21196364.0,
|
|
"step": 11485
|
|
},
|
|
{
|
|
"entropy": 5.727255821228027,
|
|
"epoch": 0.9653434152488973,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004912866328358125,
|
|
"loss": 5.5777,
|
|
"mean_token_accuracy": 0.15882964730262755,
|
|
"num_tokens": 21206376.0,
|
|
"step": 11490
|
|
},
|
|
{
|
|
"entropy": 5.742443180084228,
|
|
"epoch": 0.9657634950640621,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004912783778096266,
|
|
"loss": 5.5585,
|
|
"mean_token_accuracy": 0.16211859583854676,
|
|
"num_tokens": 21215889.0,
|
|
"step": 11495
|
|
},
|
|
{
|
|
"entropy": 5.764708566665649,
|
|
"epoch": 0.966183574879227,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004912701189521808,
|
|
"loss": 5.5637,
|
|
"mean_token_accuracy": 0.1641128808259964,
|
|
"num_tokens": 21224959.0,
|
|
"step": 11500
|
|
},
|
|
{
|
|
"entropy": 5.76860499382019,
|
|
"epoch": 0.9666036546943919,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004912618562636211,
|
|
"loss": 5.6748,
|
|
"mean_token_accuracy": 0.150497405230999,
|
|
"num_tokens": 21234495.0,
|
|
"step": 11505
|
|
},
|
|
{
|
|
"entropy": 5.671495676040649,
|
|
"epoch": 0.9670237345095568,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.000491253589744094,
|
|
"loss": 5.5133,
|
|
"mean_token_accuracy": 0.1606043353676796,
|
|
"num_tokens": 21244555.0,
|
|
"step": 11510
|
|
},
|
|
{
|
|
"entropy": 5.798943901062012,
|
|
"epoch": 0.9674438143247217,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004912453193937459,
|
|
"loss": 5.6487,
|
|
"mean_token_accuracy": 0.15496133714914323,
|
|
"num_tokens": 21254199.0,
|
|
"step": 11515
|
|
},
|
|
{
|
|
"entropy": 5.775474929809571,
|
|
"epoch": 0.9678638941398866,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004912370452127234,
|
|
"loss": 5.5574,
|
|
"mean_token_accuracy": 0.15863036513328552,
|
|
"num_tokens": 21262723.0,
|
|
"step": 11520
|
|
},
|
|
{
|
|
"entropy": 5.699438953399659,
|
|
"epoch": 0.9682839739550515,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004912287672011728,
|
|
"loss": 5.4811,
|
|
"mean_token_accuracy": 0.16491955220699311,
|
|
"num_tokens": 21271283.0,
|
|
"step": 11525
|
|
},
|
|
{
|
|
"entropy": 5.647049522399902,
|
|
"epoch": 0.9687040537702163,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004912204853592411,
|
|
"loss": 5.5223,
|
|
"mean_token_accuracy": 0.1697501629590988,
|
|
"num_tokens": 21279542.0,
|
|
"step": 11530
|
|
},
|
|
{
|
|
"entropy": 5.697355556488037,
|
|
"epoch": 0.9691241335853812,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004912121996870748,
|
|
"loss": 5.5099,
|
|
"mean_token_accuracy": 0.1619595393538475,
|
|
"num_tokens": 21288678.0,
|
|
"step": 11535
|
|
},
|
|
{
|
|
"entropy": 5.812397384643555,
|
|
"epoch": 0.9695442134005461,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004912039101848207,
|
|
"loss": 5.6565,
|
|
"mean_token_accuracy": 0.15555005446076392,
|
|
"num_tokens": 21298982.0,
|
|
"step": 11540
|
|
},
|
|
{
|
|
"entropy": 5.709640407562256,
|
|
"epoch": 0.969964293215711,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004911956168526257,
|
|
"loss": 5.5863,
|
|
"mean_token_accuracy": 0.15883329659700393,
|
|
"num_tokens": 21307663.0,
|
|
"step": 11545
|
|
},
|
|
{
|
|
"entropy": 5.733417558670044,
|
|
"epoch": 0.9703843730308759,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004911873196906366,
|
|
"loss": 5.5912,
|
|
"mean_token_accuracy": 0.15337843149900438,
|
|
"num_tokens": 21318004.0,
|
|
"step": 11550
|
|
},
|
|
{
|
|
"entropy": 5.624382829666137,
|
|
"epoch": 0.9708044528460408,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004911790186990005,
|
|
"loss": 5.4055,
|
|
"mean_token_accuracy": 0.17055178582668304,
|
|
"num_tokens": 21327373.0,
|
|
"step": 11555
|
|
},
|
|
{
|
|
"entropy": 5.6697979927062985,
|
|
"epoch": 0.9712245326612057,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004911707138778643,
|
|
"loss": 5.5004,
|
|
"mean_token_accuracy": 0.15926228910684587,
|
|
"num_tokens": 21335654.0,
|
|
"step": 11560
|
|
},
|
|
{
|
|
"entropy": 5.797804164886474,
|
|
"epoch": 0.9716446124763705,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004911624052273754,
|
|
"loss": 5.5656,
|
|
"mean_token_accuracy": 0.15943690538406372,
|
|
"num_tokens": 21344464.0,
|
|
"step": 11565
|
|
},
|
|
{
|
|
"entropy": 5.783263635635376,
|
|
"epoch": 0.9720646922915354,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004911540927476807,
|
|
"loss": 5.6574,
|
|
"mean_token_accuracy": 0.1580268144607544,
|
|
"num_tokens": 21354121.0,
|
|
"step": 11570
|
|
},
|
|
{
|
|
"entropy": 5.765107583999634,
|
|
"epoch": 0.9724847721067003,
|
|
"grad_norm": 2.296875,
|
|
"learning_rate": 0.0004911457764389275,
|
|
"loss": 5.5917,
|
|
"mean_token_accuracy": 0.16147245317697526,
|
|
"num_tokens": 21363395.0,
|
|
"step": 11575
|
|
},
|
|
{
|
|
"entropy": 5.731990480422974,
|
|
"epoch": 0.9729048519218652,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004911374563012633,
|
|
"loss": 5.5543,
|
|
"mean_token_accuracy": 0.15289353728294372,
|
|
"num_tokens": 21372126.0,
|
|
"step": 11580
|
|
},
|
|
{
|
|
"entropy": 5.778444862365722,
|
|
"epoch": 0.97332493173703,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004911291323348352,
|
|
"loss": 5.6253,
|
|
"mean_token_accuracy": 0.157344251871109,
|
|
"num_tokens": 21380554.0,
|
|
"step": 11585
|
|
},
|
|
{
|
|
"entropy": 5.706854677200317,
|
|
"epoch": 0.973745011552195,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004911208045397909,
|
|
"loss": 5.514,
|
|
"mean_token_accuracy": 0.15825875103473663,
|
|
"num_tokens": 21389317.0,
|
|
"step": 11590
|
|
},
|
|
{
|
|
"entropy": 5.79919662475586,
|
|
"epoch": 0.9741650913673598,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004911124729162778,
|
|
"loss": 5.6453,
|
|
"mean_token_accuracy": 0.1541647955775261,
|
|
"num_tokens": 21398926.0,
|
|
"step": 11595
|
|
},
|
|
{
|
|
"entropy": 5.782274341583252,
|
|
"epoch": 0.9745851711825246,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004911041374644435,
|
|
"loss": 5.4558,
|
|
"mean_token_accuracy": 0.16338740587234496,
|
|
"num_tokens": 21406962.0,
|
|
"step": 11600
|
|
},
|
|
{
|
|
"entropy": 5.757547998428345,
|
|
"epoch": 0.9750052509976895,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004910957981844357,
|
|
"loss": 5.5533,
|
|
"mean_token_accuracy": 0.16179322302341462,
|
|
"num_tokens": 21415868.0,
|
|
"step": 11605
|
|
},
|
|
{
|
|
"entropy": 5.77811369895935,
|
|
"epoch": 0.9754253308128544,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004910874550764022,
|
|
"loss": 5.6645,
|
|
"mean_token_accuracy": 0.15746580213308334,
|
|
"num_tokens": 21424544.0,
|
|
"step": 11610
|
|
},
|
|
{
|
|
"entropy": 5.659853553771972,
|
|
"epoch": 0.9758454106280193,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004910791081404907,
|
|
"loss": 5.5353,
|
|
"mean_token_accuracy": 0.16283178329467773,
|
|
"num_tokens": 21433589.0,
|
|
"step": 11615
|
|
},
|
|
{
|
|
"entropy": 5.700931072235107,
|
|
"epoch": 0.9762654904431842,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004910707573768489,
|
|
"loss": 5.5844,
|
|
"mean_token_accuracy": 0.15395784080028535,
|
|
"num_tokens": 21442084.0,
|
|
"step": 11620
|
|
},
|
|
{
|
|
"entropy": 5.728076314926147,
|
|
"epoch": 0.9766855702583491,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004910624027856251,
|
|
"loss": 5.5069,
|
|
"mean_token_accuracy": 0.16065045148134233,
|
|
"num_tokens": 21450962.0,
|
|
"step": 11625
|
|
},
|
|
{
|
|
"entropy": 5.780987930297852,
|
|
"epoch": 0.977105650073514,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004910540443669669,
|
|
"loss": 5.6095,
|
|
"mean_token_accuracy": 0.1500787116587162,
|
|
"num_tokens": 21461322.0,
|
|
"step": 11630
|
|
},
|
|
{
|
|
"entropy": 5.753719711303711,
|
|
"epoch": 0.9775257298886788,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004910456821210227,
|
|
"loss": 5.5737,
|
|
"mean_token_accuracy": 0.1601569026708603,
|
|
"num_tokens": 21470800.0,
|
|
"step": 11635
|
|
},
|
|
{
|
|
"entropy": 5.7415358543396,
|
|
"epoch": 0.9779458097038437,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004910373160479404,
|
|
"loss": 5.4411,
|
|
"mean_token_accuracy": 0.1655646875500679,
|
|
"num_tokens": 21479707.0,
|
|
"step": 11640
|
|
},
|
|
{
|
|
"entropy": 5.748549699783325,
|
|
"epoch": 0.9783658895190086,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004910289461478683,
|
|
"loss": 5.6165,
|
|
"mean_token_accuracy": 0.15316568911075593,
|
|
"num_tokens": 21489469.0,
|
|
"step": 11645
|
|
},
|
|
{
|
|
"entropy": 5.751054954528809,
|
|
"epoch": 0.9787859693341735,
|
|
"grad_norm": 2.890625,
|
|
"learning_rate": 0.0004910205724209547,
|
|
"loss": 5.5889,
|
|
"mean_token_accuracy": 0.15403057783842086,
|
|
"num_tokens": 21499226.0,
|
|
"step": 11650
|
|
},
|
|
{
|
|
"entropy": 5.619789409637451,
|
|
"epoch": 0.9792060491493384,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004910121948673478,
|
|
"loss": 5.4392,
|
|
"mean_token_accuracy": 0.1629665359854698,
|
|
"num_tokens": 21508129.0,
|
|
"step": 11655
|
|
},
|
|
{
|
|
"entropy": 5.699333000183105,
|
|
"epoch": 0.9796261289645033,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004910038134871962,
|
|
"loss": 5.4801,
|
|
"mean_token_accuracy": 0.1643253058195114,
|
|
"num_tokens": 21516293.0,
|
|
"step": 11660
|
|
},
|
|
{
|
|
"entropy": 5.8036095142364506,
|
|
"epoch": 0.9800462087796681,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004909954282806482,
|
|
"loss": 5.6387,
|
|
"mean_token_accuracy": 0.16016865894198418,
|
|
"num_tokens": 21525393.0,
|
|
"step": 11665
|
|
},
|
|
{
|
|
"entropy": 5.6630861282348635,
|
|
"epoch": 0.980466288594833,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004909870392478524,
|
|
"loss": 5.5018,
|
|
"mean_token_accuracy": 0.16075061559677123,
|
|
"num_tokens": 21534585.0,
|
|
"step": 11670
|
|
},
|
|
{
|
|
"entropy": 5.669094085693359,
|
|
"epoch": 0.9808863684099979,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004909786463889575,
|
|
"loss": 5.4321,
|
|
"mean_token_accuracy": 0.1637320727109909,
|
|
"num_tokens": 21542947.0,
|
|
"step": 11675
|
|
},
|
|
{
|
|
"entropy": 5.70667781829834,
|
|
"epoch": 0.9813064482251628,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004909702497041121,
|
|
"loss": 5.5403,
|
|
"mean_token_accuracy": 0.16008226573467255,
|
|
"num_tokens": 21552168.0,
|
|
"step": 11680
|
|
},
|
|
{
|
|
"entropy": 5.744578647613525,
|
|
"epoch": 0.9817265280403277,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004909618491934648,
|
|
"loss": 5.5414,
|
|
"mean_token_accuracy": 0.16264388114213943,
|
|
"num_tokens": 21562131.0,
|
|
"step": 11685
|
|
},
|
|
{
|
|
"entropy": 5.708233642578125,
|
|
"epoch": 0.9821466078554926,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004909534448571647,
|
|
"loss": 5.5015,
|
|
"mean_token_accuracy": 0.16573767215013505,
|
|
"num_tokens": 21571363.0,
|
|
"step": 11690
|
|
},
|
|
{
|
|
"entropy": 5.73539662361145,
|
|
"epoch": 0.9825666876706575,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004909450366953604,
|
|
"loss": 5.4867,
|
|
"mean_token_accuracy": 0.16546985059976577,
|
|
"num_tokens": 21580754.0,
|
|
"step": 11695
|
|
},
|
|
{
|
|
"entropy": 5.716777801513672,
|
|
"epoch": 0.9829867674858223,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000490936624708201,
|
|
"loss": 5.5823,
|
|
"mean_token_accuracy": 0.16040044873952866,
|
|
"num_tokens": 21590053.0,
|
|
"step": 11700
|
|
},
|
|
{
|
|
"entropy": 5.717460250854492,
|
|
"epoch": 0.9834068473009872,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004909282088958356,
|
|
"loss": 5.5319,
|
|
"mean_token_accuracy": 0.15609903037548065,
|
|
"num_tokens": 21598681.0,
|
|
"step": 11705
|
|
},
|
|
{
|
|
"entropy": 5.778473806381226,
|
|
"epoch": 0.983826927116152,
|
|
"grad_norm": 2.625,
|
|
"learning_rate": 0.000490919789258413,
|
|
"loss": 5.5699,
|
|
"mean_token_accuracy": 0.16667143106460572,
|
|
"num_tokens": 21607465.0,
|
|
"step": 11710
|
|
},
|
|
{
|
|
"entropy": 5.76135516166687,
|
|
"epoch": 0.984247006931317,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004909113657960826,
|
|
"loss": 5.6466,
|
|
"mean_token_accuracy": 0.1501710444688797,
|
|
"num_tokens": 21617480.0,
|
|
"step": 11715
|
|
},
|
|
{
|
|
"entropy": 5.732358407974243,
|
|
"epoch": 0.9846670867464818,
|
|
"grad_norm": 2.84375,
|
|
"learning_rate": 0.0004909029385089935,
|
|
"loss": 5.56,
|
|
"mean_token_accuracy": 0.16049663126468658,
|
|
"num_tokens": 21626434.0,
|
|
"step": 11720
|
|
},
|
|
{
|
|
"entropy": 5.741581773757934,
|
|
"epoch": 0.9850871665616467,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.000490894507397295,
|
|
"loss": 5.5349,
|
|
"mean_token_accuracy": 0.16318391263484955,
|
|
"num_tokens": 21635627.0,
|
|
"step": 11725
|
|
},
|
|
{
|
|
"entropy": 5.715289211273193,
|
|
"epoch": 0.9855072463768116,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004908860724611365,
|
|
"loss": 5.538,
|
|
"mean_token_accuracy": 0.1623338758945465,
|
|
"num_tokens": 21644789.0,
|
|
"step": 11730
|
|
},
|
|
{
|
|
"entropy": 5.649227333068848,
|
|
"epoch": 0.9859273261919764,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004908776337006675,
|
|
"loss": 5.5411,
|
|
"mean_token_accuracy": 0.15639186650514603,
|
|
"num_tokens": 21653696.0,
|
|
"step": 11735
|
|
},
|
|
{
|
|
"entropy": 5.758316421508789,
|
|
"epoch": 0.9863474060071413,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004908691911160373,
|
|
"loss": 5.5441,
|
|
"mean_token_accuracy": 0.14904894530773163,
|
|
"num_tokens": 21664420.0,
|
|
"step": 11740
|
|
},
|
|
{
|
|
"entropy": 5.777101802825928,
|
|
"epoch": 0.9867674858223062,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004908607447073954,
|
|
"loss": 5.532,
|
|
"mean_token_accuracy": 0.16310076788067818,
|
|
"num_tokens": 21673716.0,
|
|
"step": 11745
|
|
},
|
|
{
|
|
"entropy": 5.710586738586426,
|
|
"epoch": 0.9871875656374711,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004908522944748917,
|
|
"loss": 5.5124,
|
|
"mean_token_accuracy": 0.16769290566444398,
|
|
"num_tokens": 21682860.0,
|
|
"step": 11750
|
|
},
|
|
{
|
|
"entropy": 5.605210161209106,
|
|
"epoch": 0.987607645452636,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004908438404186758,
|
|
"loss": 5.5075,
|
|
"mean_token_accuracy": 0.17046434432268143,
|
|
"num_tokens": 21691915.0,
|
|
"step": 11755
|
|
},
|
|
{
|
|
"entropy": 5.718367576599121,
|
|
"epoch": 0.9880277252678009,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004908353825388973,
|
|
"loss": 5.6478,
|
|
"mean_token_accuracy": 0.1539776936173439,
|
|
"num_tokens": 21701666.0,
|
|
"step": 11760
|
|
},
|
|
{
|
|
"entropy": 5.8222753524780275,
|
|
"epoch": 0.9884478050829658,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004908269208357062,
|
|
"loss": 5.5593,
|
|
"mean_token_accuracy": 0.1612561821937561,
|
|
"num_tokens": 21709267.0,
|
|
"step": 11765
|
|
},
|
|
{
|
|
"entropy": 5.731483888626099,
|
|
"epoch": 0.9888678848981306,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004908184553092523,
|
|
"loss": 5.4654,
|
|
"mean_token_accuracy": 0.15959916561841964,
|
|
"num_tokens": 21718117.0,
|
|
"step": 11770
|
|
},
|
|
{
|
|
"entropy": 5.744333124160766,
|
|
"epoch": 0.9892879647132955,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004908099859596856,
|
|
"loss": 5.5856,
|
|
"mean_token_accuracy": 0.16114241033792495,
|
|
"num_tokens": 21727952.0,
|
|
"step": 11775
|
|
},
|
|
{
|
|
"entropy": 5.728325080871582,
|
|
"epoch": 0.9897080445284604,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004908015127871561,
|
|
"loss": 5.4976,
|
|
"mean_token_accuracy": 0.15698132812976837,
|
|
"num_tokens": 21737878.0,
|
|
"step": 11780
|
|
},
|
|
{
|
|
"entropy": 5.6280214309692385,
|
|
"epoch": 0.9901281243436253,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.000490793035791814,
|
|
"loss": 5.4182,
|
|
"mean_token_accuracy": 0.16316151171922683,
|
|
"num_tokens": 21747391.0,
|
|
"step": 11785
|
|
},
|
|
{
|
|
"entropy": 5.682892322540283,
|
|
"epoch": 0.9905482041587902,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004907845549738093,
|
|
"loss": 5.4602,
|
|
"mean_token_accuracy": 0.1639404833316803,
|
|
"num_tokens": 21756791.0,
|
|
"step": 11790
|
|
},
|
|
{
|
|
"entropy": 5.634412670135498,
|
|
"epoch": 0.9909682839739551,
|
|
"grad_norm": 2.328125,
|
|
"learning_rate": 0.0004907760703332923,
|
|
"loss": 5.5007,
|
|
"mean_token_accuracy": 0.162163844704628,
|
|
"num_tokens": 21766020.0,
|
|
"step": 11795
|
|
},
|
|
{
|
|
"entropy": 5.730725479125977,
|
|
"epoch": 0.99138836378912,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004907675818704134,
|
|
"loss": 5.6054,
|
|
"mean_token_accuracy": 0.15719361007213592,
|
|
"num_tokens": 21775895.0,
|
|
"step": 11800
|
|
},
|
|
{
|
|
"entropy": 5.708724880218506,
|
|
"epoch": 0.9918084436042848,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004907590895853228,
|
|
"loss": 5.5163,
|
|
"mean_token_accuracy": 0.16296806633472444,
|
|
"num_tokens": 21784543.0,
|
|
"step": 11805
|
|
},
|
|
{
|
|
"entropy": 5.711108875274658,
|
|
"epoch": 0.9922285234194497,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004907505934781712,
|
|
"loss": 5.5718,
|
|
"mean_token_accuracy": 0.1565998300909996,
|
|
"num_tokens": 21793938.0,
|
|
"step": 11810
|
|
},
|
|
{
|
|
"entropy": 5.700426530838013,
|
|
"epoch": 0.9926486032346146,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004907420935491087,
|
|
"loss": 5.5456,
|
|
"mean_token_accuracy": 0.15969295799732208,
|
|
"num_tokens": 21803641.0,
|
|
"step": 11815
|
|
},
|
|
{
|
|
"entropy": 5.692903614044189,
|
|
"epoch": 0.9930686830497795,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004907335897982862,
|
|
"loss": 5.4649,
|
|
"mean_token_accuracy": 0.1696073144674301,
|
|
"num_tokens": 21812542.0,
|
|
"step": 11820
|
|
},
|
|
{
|
|
"entropy": 5.636066865921021,
|
|
"epoch": 0.9934887628649444,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0004907250822258543,
|
|
"loss": 5.5573,
|
|
"mean_token_accuracy": 0.157867032289505,
|
|
"num_tokens": 21821847.0,
|
|
"step": 11825
|
|
},
|
|
{
|
|
"entropy": 5.794892740249634,
|
|
"epoch": 0.9939088426801093,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004907165708319637,
|
|
"loss": 5.5963,
|
|
"mean_token_accuracy": 0.15924146473407746,
|
|
"num_tokens": 21830799.0,
|
|
"step": 11830
|
|
},
|
|
{
|
|
"entropy": 5.723268651962281,
|
|
"epoch": 0.994328922495274,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004907080556167651,
|
|
"loss": 5.5417,
|
|
"mean_token_accuracy": 0.16239422112703322,
|
|
"num_tokens": 21840202.0,
|
|
"step": 11835
|
|
},
|
|
{
|
|
"entropy": 5.8009960651397705,
|
|
"epoch": 0.994749002310439,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004906995365804093,
|
|
"loss": 5.6415,
|
|
"mean_token_accuracy": 0.15165836364030838,
|
|
"num_tokens": 21849701.0,
|
|
"step": 11840
|
|
},
|
|
{
|
|
"entropy": 5.7475098133087155,
|
|
"epoch": 0.9951690821256038,
|
|
"grad_norm": 3.125,
|
|
"learning_rate": 0.0004906910137230472,
|
|
"loss": 5.5083,
|
|
"mean_token_accuracy": 0.16084536910057068,
|
|
"num_tokens": 21859191.0,
|
|
"step": 11845
|
|
},
|
|
{
|
|
"entropy": 5.686148691177368,
|
|
"epoch": 0.9955891619407687,
|
|
"grad_norm": 3.546875,
|
|
"learning_rate": 0.00049068248704483,
|
|
"loss": 5.5053,
|
|
"mean_token_accuracy": 0.16064416766166686,
|
|
"num_tokens": 21867944.0,
|
|
"step": 11850
|
|
},
|
|
{
|
|
"entropy": 5.62108964920044,
|
|
"epoch": 0.9960092417559336,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004906739565459085,
|
|
"loss": 5.5337,
|
|
"mean_token_accuracy": 0.15536581873893737,
|
|
"num_tokens": 21876368.0,
|
|
"step": 11855
|
|
},
|
|
{
|
|
"entropy": 5.824420833587647,
|
|
"epoch": 0.9964293215710985,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000490665422226434,
|
|
"loss": 5.6125,
|
|
"mean_token_accuracy": 0.15335456728935243,
|
|
"num_tokens": 21885634.0,
|
|
"step": 11860
|
|
},
|
|
{
|
|
"entropy": 5.684914779663086,
|
|
"epoch": 0.9968494013862634,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004906568840865576,
|
|
"loss": 5.4319,
|
|
"mean_token_accuracy": 0.16591923534870148,
|
|
"num_tokens": 21894315.0,
|
|
"step": 11865
|
|
},
|
|
{
|
|
"entropy": 5.64821228981018,
|
|
"epoch": 0.9972694812014282,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004906483421264305,
|
|
"loss": 5.5444,
|
|
"mean_token_accuracy": 0.16124298125505448,
|
|
"num_tokens": 21903342.0,
|
|
"step": 11870
|
|
},
|
|
{
|
|
"entropy": 5.722136449813843,
|
|
"epoch": 0.9976895610165931,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.000490639796346204,
|
|
"loss": 5.6689,
|
|
"mean_token_accuracy": 0.15262537002563475,
|
|
"num_tokens": 21914158.0,
|
|
"step": 11875
|
|
},
|
|
{
|
|
"entropy": 5.826263189315796,
|
|
"epoch": 0.998109640831758,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004906312467460297,
|
|
"loss": 5.5359,
|
|
"mean_token_accuracy": 0.16018486469984056,
|
|
"num_tokens": 21922639.0,
|
|
"step": 11880
|
|
},
|
|
{
|
|
"entropy": 5.733368921279907,
|
|
"epoch": 0.9985297206469229,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004906226933260588,
|
|
"loss": 5.5305,
|
|
"mean_token_accuracy": 0.1628845065832138,
|
|
"num_tokens": 21931385.0,
|
|
"step": 11885
|
|
},
|
|
{
|
|
"entropy": 5.763300991058349,
|
|
"epoch": 0.9989498004620878,
|
|
"grad_norm": 2.53125,
|
|
"learning_rate": 0.0004906141360864429,
|
|
"loss": 5.5486,
|
|
"mean_token_accuracy": 0.1582605332136154,
|
|
"num_tokens": 21940788.0,
|
|
"step": 11890
|
|
},
|
|
{
|
|
"entropy": 5.750997114181518,
|
|
"epoch": 0.9993698802772527,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004906055750273336,
|
|
"loss": 5.5633,
|
|
"mean_token_accuracy": 0.15954309850931167,
|
|
"num_tokens": 21950309.0,
|
|
"step": 11895
|
|
},
|
|
{
|
|
"entropy": 5.696226406097412,
|
|
"epoch": 0.9997899600924176,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004905970101488826,
|
|
"loss": 5.5533,
|
|
"mean_token_accuracy": 0.16005461364984513,
|
|
"num_tokens": 21959141.0,
|
|
"step": 11900
|
|
},
|
|
{
|
|
"entropy": 5.797101603613959,
|
|
"epoch": 1.000168031926066,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004905884414512416,
|
|
"loss": 5.5832,
|
|
"mean_token_accuracy": 0.16606505546304914,
|
|
"num_tokens": 21966665.0,
|
|
"step": 11905
|
|
},
|
|
{
|
|
"entropy": 5.731563329696655,
|
|
"epoch": 1.0005881117412307,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004905798689345623,
|
|
"loss": 5.5626,
|
|
"mean_token_accuracy": 0.16171082034707068,
|
|
"num_tokens": 21976728.0,
|
|
"step": 11910
|
|
},
|
|
{
|
|
"entropy": 5.629949522018433,
|
|
"epoch": 1.0010081915563958,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004905712925989968,
|
|
"loss": 5.3915,
|
|
"mean_token_accuracy": 0.1608661249279976,
|
|
"num_tokens": 21985915.0,
|
|
"step": 11915
|
|
},
|
|
{
|
|
"entropy": 5.7110429286956785,
|
|
"epoch": 1.0014282713715605,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004905627124446967,
|
|
"loss": 5.4499,
|
|
"mean_token_accuracy": 0.1611791133880615,
|
|
"num_tokens": 21995826.0,
|
|
"step": 11920
|
|
},
|
|
{
|
|
"entropy": 5.700401496887207,
|
|
"epoch": 1.0018483511867255,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004905541284718142,
|
|
"loss": 5.418,
|
|
"mean_token_accuracy": 0.16362681239843369,
|
|
"num_tokens": 22005299.0,
|
|
"step": 11925
|
|
},
|
|
{
|
|
"entropy": 5.667144775390625,
|
|
"epoch": 1.0022684310018903,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004905455406805011,
|
|
"loss": 5.4529,
|
|
"mean_token_accuracy": 0.16137843877077102,
|
|
"num_tokens": 22014499.0,
|
|
"step": 11930
|
|
},
|
|
{
|
|
"entropy": 5.7403564453125,
|
|
"epoch": 1.0026885108170553,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.00049053694907091,
|
|
"loss": 5.6015,
|
|
"mean_token_accuracy": 0.15019496381282807,
|
|
"num_tokens": 22024531.0,
|
|
"step": 11935
|
|
},
|
|
{
|
|
"entropy": 5.714895439147949,
|
|
"epoch": 1.0031085906322201,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004905283536431928,
|
|
"loss": 5.5081,
|
|
"mean_token_accuracy": 0.16284531652927398,
|
|
"num_tokens": 22034036.0,
|
|
"step": 11940
|
|
},
|
|
{
|
|
"entropy": 5.638365077972412,
|
|
"epoch": 1.003528670447385,
|
|
"grad_norm": 4.3125,
|
|
"learning_rate": 0.0004905197543975017,
|
|
"loss": 5.4115,
|
|
"mean_token_accuracy": 0.1633414715528488,
|
|
"num_tokens": 22042910.0,
|
|
"step": 11945
|
|
},
|
|
{
|
|
"entropy": 5.738863801956176,
|
|
"epoch": 1.00394875026255,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004905111513339892,
|
|
"loss": 5.5209,
|
|
"mean_token_accuracy": 0.15980954617261886,
|
|
"num_tokens": 22052242.0,
|
|
"step": 11950
|
|
},
|
|
{
|
|
"entropy": 5.7258447170257565,
|
|
"epoch": 1.0043688300777147,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004905025444528076,
|
|
"loss": 5.4832,
|
|
"mean_token_accuracy": 0.15796915143728257,
|
|
"num_tokens": 22061467.0,
|
|
"step": 11955
|
|
},
|
|
{
|
|
"entropy": 5.585642290115357,
|
|
"epoch": 1.0047889098928797,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004904939337541093,
|
|
"loss": 5.3351,
|
|
"mean_token_accuracy": 0.16610788106918334,
|
|
"num_tokens": 22070300.0,
|
|
"step": 11960
|
|
},
|
|
{
|
|
"entropy": 5.718946933746338,
|
|
"epoch": 1.0052089897080445,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004904853192380472,
|
|
"loss": 5.4909,
|
|
"mean_token_accuracy": 0.16344053149223328,
|
|
"num_tokens": 22078960.0,
|
|
"step": 11965
|
|
},
|
|
{
|
|
"entropy": 5.742123985290528,
|
|
"epoch": 1.0056290695232095,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004904767009047733,
|
|
"loss": 5.4348,
|
|
"mean_token_accuracy": 0.1622706487774849,
|
|
"num_tokens": 22088135.0,
|
|
"step": 11970
|
|
},
|
|
{
|
|
"entropy": 5.737835264205932,
|
|
"epoch": 1.0060491493383743,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004904680787544408,
|
|
"loss": 5.5541,
|
|
"mean_token_accuracy": 0.15428006947040557,
|
|
"num_tokens": 22098004.0,
|
|
"step": 11975
|
|
},
|
|
{
|
|
"entropy": 5.801382684707642,
|
|
"epoch": 1.006469229153539,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004904594527872022,
|
|
"loss": 5.5298,
|
|
"mean_token_accuracy": 0.15679605156183243,
|
|
"num_tokens": 22107680.0,
|
|
"step": 11980
|
|
},
|
|
{
|
|
"entropy": 5.745111274719238,
|
|
"epoch": 1.006889308968704,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004904508230032103,
|
|
"loss": 5.5311,
|
|
"mean_token_accuracy": 0.16063058525323867,
|
|
"num_tokens": 22118004.0,
|
|
"step": 11985
|
|
},
|
|
{
|
|
"entropy": 5.689800071716308,
|
|
"epoch": 1.0073093887838689,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.000490442189402618,
|
|
"loss": 5.4625,
|
|
"mean_token_accuracy": 0.1709965944290161,
|
|
"num_tokens": 22127825.0,
|
|
"step": 11990
|
|
},
|
|
{
|
|
"entropy": 5.684913110733032,
|
|
"epoch": 1.007729468599034,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004904335519855783,
|
|
"loss": 5.3906,
|
|
"mean_token_accuracy": 0.1667776048183441,
|
|
"num_tokens": 22136448.0,
|
|
"step": 11995
|
|
},
|
|
{
|
|
"entropy": 5.670077896118164,
|
|
"epoch": 1.0081495484141987,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004904249107522442,
|
|
"loss": 5.5095,
|
|
"mean_token_accuracy": 0.15952253490686416,
|
|
"num_tokens": 22146415.0,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"epoch": 1.0081495484141987,
|
|
"eval_entropy": 5.499641510231085,
|
|
"eval_loss": 5.561922073364258,
|
|
"eval_mean_token_accuracy": 0.16659502685389052,
|
|
"eval_num_tokens": 22146415.0,
|
|
"eval_runtime": 27.1007,
|
|
"eval_samples_per_second": 1378.784,
|
|
"eval_steps_per_second": 172.357,
|
|
"step": 12000
|
|
},
|
|
{
|
|
"entropy": 5.780040836334228,
|
|
"epoch": 1.0085696282293637,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004904162657027685,
|
|
"loss": 5.6277,
|
|
"mean_token_accuracy": 0.16104299575090408,
|
|
"num_tokens": 22156327.0,
|
|
"step": 12005
|
|
},
|
|
{
|
|
"entropy": 5.666675329208374,
|
|
"epoch": 1.0089897080445285,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004904076168373049,
|
|
"loss": 5.4484,
|
|
"mean_token_accuracy": 0.1636186048388481,
|
|
"num_tokens": 22165677.0,
|
|
"step": 12010
|
|
},
|
|
{
|
|
"entropy": 5.724397230148315,
|
|
"epoch": 1.0094097878596933,
|
|
"grad_norm": 2.453125,
|
|
"learning_rate": 0.0004903989641560061,
|
|
"loss": 5.5786,
|
|
"mean_token_accuracy": 0.16150926798582077,
|
|
"num_tokens": 22175232.0,
|
|
"step": 12015
|
|
},
|
|
{
|
|
"entropy": 5.721542501449585,
|
|
"epoch": 1.0098298676748583,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004903903076590256,
|
|
"loss": 5.461,
|
|
"mean_token_accuracy": 0.15944595038890838,
|
|
"num_tokens": 22184026.0,
|
|
"step": 12020
|
|
},
|
|
{
|
|
"entropy": 5.613580513000488,
|
|
"epoch": 1.010249947490023,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004903816473465167,
|
|
"loss": 5.3558,
|
|
"mean_token_accuracy": 0.1735146701335907,
|
|
"num_tokens": 22192020.0,
|
|
"step": 12025
|
|
},
|
|
{
|
|
"entropy": 5.620361232757569,
|
|
"epoch": 1.010670027305188,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004903729832186328,
|
|
"loss": 5.3474,
|
|
"mean_token_accuracy": 0.17106474488973616,
|
|
"num_tokens": 22200060.0,
|
|
"step": 12030
|
|
},
|
|
{
|
|
"entropy": 5.619206380844116,
|
|
"epoch": 1.0110901071203529,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004903643152755274,
|
|
"loss": 5.3797,
|
|
"mean_token_accuracy": 0.16311112642288209,
|
|
"num_tokens": 22208625.0,
|
|
"step": 12035
|
|
},
|
|
{
|
|
"entropy": 5.636928939819336,
|
|
"epoch": 1.0115101869355176,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004903556435173541,
|
|
"loss": 5.3741,
|
|
"mean_token_accuracy": 0.1699930876493454,
|
|
"num_tokens": 22217781.0,
|
|
"step": 12040
|
|
},
|
|
{
|
|
"entropy": 5.725437021255493,
|
|
"epoch": 1.0119302667506826,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004903469679442665,
|
|
"loss": 5.5068,
|
|
"mean_token_accuracy": 0.1630315825343132,
|
|
"num_tokens": 22226432.0,
|
|
"step": 12045
|
|
},
|
|
{
|
|
"entropy": 5.628535366058349,
|
|
"epoch": 1.0123503465658474,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004903382885564181,
|
|
"loss": 5.4978,
|
|
"mean_token_accuracy": 0.1623180940747261,
|
|
"num_tokens": 22234811.0,
|
|
"step": 12050
|
|
},
|
|
{
|
|
"entropy": 5.606299209594726,
|
|
"epoch": 1.0127704263810124,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000490329605353963,
|
|
"loss": 5.3746,
|
|
"mean_token_accuracy": 0.1714473158121109,
|
|
"num_tokens": 22242808.0,
|
|
"step": 12055
|
|
},
|
|
{
|
|
"entropy": 5.720843267440796,
|
|
"epoch": 1.0131905061961772,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004903209183370547,
|
|
"loss": 5.4519,
|
|
"mean_token_accuracy": 0.16782257854938507,
|
|
"num_tokens": 22251371.0,
|
|
"step": 12060
|
|
},
|
|
{
|
|
"entropy": 5.802830362319947,
|
|
"epoch": 1.0136105860113422,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004903122275058472,
|
|
"loss": 5.5244,
|
|
"mean_token_accuracy": 0.16609688699245453,
|
|
"num_tokens": 22260868.0,
|
|
"step": 12065
|
|
},
|
|
{
|
|
"entropy": 5.656514883041382,
|
|
"epoch": 1.014030665826507,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004903035328604944,
|
|
"loss": 5.4333,
|
|
"mean_token_accuracy": 0.16029565781354904,
|
|
"num_tokens": 22270554.0,
|
|
"step": 12070
|
|
},
|
|
{
|
|
"entropy": 5.644852542877198,
|
|
"epoch": 1.0144507456416718,
|
|
"grad_norm": 2.484375,
|
|
"learning_rate": 0.0004902948344011506,
|
|
"loss": 5.4394,
|
|
"mean_token_accuracy": 0.1594587244093418,
|
|
"num_tokens": 22279170.0,
|
|
"step": 12075
|
|
},
|
|
{
|
|
"entropy": 5.762671756744385,
|
|
"epoch": 1.0148708254568368,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004902861321279694,
|
|
"loss": 5.5814,
|
|
"mean_token_accuracy": 0.15654578879475595,
|
|
"num_tokens": 22288788.0,
|
|
"step": 12080
|
|
},
|
|
{
|
|
"entropy": 5.685765647888184,
|
|
"epoch": 1.0152909052720016,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004902774260411055,
|
|
"loss": 5.3707,
|
|
"mean_token_accuracy": 0.1631255716085434,
|
|
"num_tokens": 22297501.0,
|
|
"step": 12085
|
|
},
|
|
{
|
|
"entropy": 5.645720195770264,
|
|
"epoch": 1.0157109850871666,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004902687161407126,
|
|
"loss": 5.3224,
|
|
"mean_token_accuracy": 0.17397422790527345,
|
|
"num_tokens": 22306181.0,
|
|
"step": 12090
|
|
},
|
|
{
|
|
"entropy": 5.625262689590454,
|
|
"epoch": 1.0161310649023314,
|
|
"grad_norm": 3.046875,
|
|
"learning_rate": 0.0004902600024269454,
|
|
"loss": 5.4623,
|
|
"mean_token_accuracy": 0.16882133185863496,
|
|
"num_tokens": 22315762.0,
|
|
"step": 12095
|
|
},
|
|
{
|
|
"entropy": 5.65147967338562,
|
|
"epoch": 1.0165511447174964,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.000490251284899958,
|
|
"loss": 5.4041,
|
|
"mean_token_accuracy": 0.17007135301828386,
|
|
"num_tokens": 22325127.0,
|
|
"step": 12100
|
|
},
|
|
{
|
|
"entropy": 5.6629842758178714,
|
|
"epoch": 1.0169712245326612,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.000490242563559905,
|
|
"loss": 5.4974,
|
|
"mean_token_accuracy": 0.16227589547634125,
|
|
"num_tokens": 22334038.0,
|
|
"step": 12105
|
|
},
|
|
{
|
|
"entropy": 5.6420886516571045,
|
|
"epoch": 1.017391304347826,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004902338384069408,
|
|
"loss": 5.3571,
|
|
"mean_token_accuracy": 0.16831498742103576,
|
|
"num_tokens": 22342658.0,
|
|
"step": 12110
|
|
},
|
|
{
|
|
"entropy": 5.773781061172485,
|
|
"epoch": 1.017811384162991,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00049022510944122,
|
|
"loss": 5.5329,
|
|
"mean_token_accuracy": 0.1544503778219223,
|
|
"num_tokens": 22352559.0,
|
|
"step": 12115
|
|
},
|
|
{
|
|
"entropy": 5.733228349685669,
|
|
"epoch": 1.0182314639781558,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004902163766628972,
|
|
"loss": 5.448,
|
|
"mean_token_accuracy": 0.16392027586698532,
|
|
"num_tokens": 22361455.0,
|
|
"step": 12120
|
|
},
|
|
{
|
|
"entropy": 5.713450241088867,
|
|
"epoch": 1.0186515437933208,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004902076400721271,
|
|
"loss": 5.4895,
|
|
"mean_token_accuracy": 0.16010515689849852,
|
|
"num_tokens": 22371163.0,
|
|
"step": 12125
|
|
},
|
|
{
|
|
"entropy": 5.771074008941651,
|
|
"epoch": 1.0190716236084856,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0004901988996690645,
|
|
"loss": 5.4849,
|
|
"mean_token_accuracy": 0.16219335049390793,
|
|
"num_tokens": 22379975.0,
|
|
"step": 12130
|
|
},
|
|
{
|
|
"entropy": 5.794712877273559,
|
|
"epoch": 1.0194917034236506,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004901901554538641,
|
|
"loss": 5.5188,
|
|
"mean_token_accuracy": 0.16042946726083757,
|
|
"num_tokens": 22389657.0,
|
|
"step": 12135
|
|
},
|
|
{
|
|
"entropy": 5.588501882553101,
|
|
"epoch": 1.0199117832388154,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.000490181407426681,
|
|
"loss": 5.341,
|
|
"mean_token_accuracy": 0.17044000029563905,
|
|
"num_tokens": 22398320.0,
|
|
"step": 12140
|
|
},
|
|
{
|
|
"entropy": 5.666534328460694,
|
|
"epoch": 1.0203318630539802,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004901726555876701,
|
|
"loss": 5.5372,
|
|
"mean_token_accuracy": 0.15764785856008529,
|
|
"num_tokens": 22406634.0,
|
|
"step": 12145
|
|
},
|
|
{
|
|
"entropy": 5.775462627410889,
|
|
"epoch": 1.0207519428691452,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004901638999369862,
|
|
"loss": 5.5891,
|
|
"mean_token_accuracy": 0.1614912211894989,
|
|
"num_tokens": 22415939.0,
|
|
"step": 12150
|
|
},
|
|
{
|
|
"entropy": 5.739730596542358,
|
|
"epoch": 1.02117202268431,
|
|
"grad_norm": 2.140625,
|
|
"learning_rate": 0.0004901551404747847,
|
|
"loss": 5.5275,
|
|
"mean_token_accuracy": 0.15698569566011428,
|
|
"num_tokens": 22425256.0,
|
|
"step": 12155
|
|
},
|
|
{
|
|
"entropy": 5.706746006011963,
|
|
"epoch": 1.021592102499475,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004901463772012209,
|
|
"loss": 5.5684,
|
|
"mean_token_accuracy": 0.15766305550932885,
|
|
"num_tokens": 22434750.0,
|
|
"step": 12160
|
|
},
|
|
{
|
|
"entropy": 5.704181098937989,
|
|
"epoch": 1.0220121823146397,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004901376101164495,
|
|
"loss": 5.4751,
|
|
"mean_token_accuracy": 0.15738504827022554,
|
|
"num_tokens": 22443426.0,
|
|
"step": 12165
|
|
},
|
|
{
|
|
"entropy": 5.729747581481933,
|
|
"epoch": 1.0224322621298048,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004901288392206263,
|
|
"loss": 5.4715,
|
|
"mean_token_accuracy": 0.15853249728679658,
|
|
"num_tokens": 22452778.0,
|
|
"step": 12170
|
|
},
|
|
{
|
|
"entropy": 5.6724381923675535,
|
|
"epoch": 1.0228523419449695,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004901200645139064,
|
|
"loss": 5.4361,
|
|
"mean_token_accuracy": 0.17019454389810562,
|
|
"num_tokens": 22462864.0,
|
|
"step": 12175
|
|
},
|
|
{
|
|
"entropy": 5.674047183990479,
|
|
"epoch": 1.0232724217601343,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004901112859964454,
|
|
"loss": 5.4971,
|
|
"mean_token_accuracy": 0.1562313809990883,
|
|
"num_tokens": 22472849.0,
|
|
"step": 12180
|
|
},
|
|
{
|
|
"entropy": 5.676679944992065,
|
|
"epoch": 1.0236925015752993,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004901025036683987,
|
|
"loss": 5.4195,
|
|
"mean_token_accuracy": 0.1597939297556877,
|
|
"num_tokens": 22481693.0,
|
|
"step": 12185
|
|
},
|
|
{
|
|
"entropy": 5.6579413414001465,
|
|
"epoch": 1.0241125813904641,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004900937175299219,
|
|
"loss": 5.4204,
|
|
"mean_token_accuracy": 0.16194331496953965,
|
|
"num_tokens": 22490934.0,
|
|
"step": 12190
|
|
},
|
|
{
|
|
"entropy": 5.62444052696228,
|
|
"epoch": 1.0245326612056291,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004900849275811707,
|
|
"loss": 5.4767,
|
|
"mean_token_accuracy": 0.15643872767686845,
|
|
"num_tokens": 22500457.0,
|
|
"step": 12195
|
|
},
|
|
{
|
|
"entropy": 5.7097996234893795,
|
|
"epoch": 1.024952741020794,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004900761338223007,
|
|
"loss": 5.4013,
|
|
"mean_token_accuracy": 0.16027526408433915,
|
|
"num_tokens": 22509641.0,
|
|
"step": 12200
|
|
},
|
|
{
|
|
"entropy": 5.655760145187378,
|
|
"epoch": 1.025372820835959,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004900673362534677,
|
|
"loss": 5.347,
|
|
"mean_token_accuracy": 0.17173382937908171,
|
|
"num_tokens": 22518616.0,
|
|
"step": 12205
|
|
},
|
|
{
|
|
"entropy": 5.6903989791870115,
|
|
"epoch": 1.0257929006511237,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004900585348748277,
|
|
"loss": 5.4952,
|
|
"mean_token_accuracy": 0.16641637086868286,
|
|
"num_tokens": 22527599.0,
|
|
"step": 12210
|
|
},
|
|
{
|
|
"entropy": 5.6446558952331545,
|
|
"epoch": 1.0262129804662885,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004900497296865365,
|
|
"loss": 5.5004,
|
|
"mean_token_accuracy": 0.1569361686706543,
|
|
"num_tokens": 22537399.0,
|
|
"step": 12215
|
|
},
|
|
{
|
|
"entropy": 5.80555911064148,
|
|
"epoch": 1.0266330602814535,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 0.0004900409206887499,
|
|
"loss": 5.6797,
|
|
"mean_token_accuracy": 0.15186302959918976,
|
|
"num_tokens": 22546746.0,
|
|
"step": 12220
|
|
},
|
|
{
|
|
"entropy": 5.7677764892578125,
|
|
"epoch": 1.0270531400966183,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0004900321078816243,
|
|
"loss": 5.4782,
|
|
"mean_token_accuracy": 0.16736462563276291,
|
|
"num_tokens": 22555735.0,
|
|
"step": 12225
|
|
},
|
|
{
|
|
"entropy": 5.750488138198852,
|
|
"epoch": 1.0274732199117833,
|
|
"grad_norm": 2.40625,
|
|
"learning_rate": 0.0004900232912653156,
|
|
"loss": 5.4762,
|
|
"mean_token_accuracy": 0.16161205470561982,
|
|
"num_tokens": 22565010.0,
|
|
"step": 12230
|
|
},
|
|
{
|
|
"entropy": 5.669838380813599,
|
|
"epoch": 1.027893299726948,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.00049001447083998,
|
|
"loss": 5.4535,
|
|
"mean_token_accuracy": 0.1628490686416626,
|
|
"num_tokens": 22573565.0,
|
|
"step": 12235
|
|
},
|
|
{
|
|
"entropy": 5.709480333328247,
|
|
"epoch": 1.028313379542113,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004900056466057737,
|
|
"loss": 5.4534,
|
|
"mean_token_accuracy": 0.15907117128372192,
|
|
"num_tokens": 22582549.0,
|
|
"step": 12240
|
|
},
|
|
{
|
|
"entropy": 5.680795192718506,
|
|
"epoch": 1.028733459357278,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004899968185628531,
|
|
"loss": 5.5212,
|
|
"mean_token_accuracy": 0.1543809860944748,
|
|
"num_tokens": 22592112.0,
|
|
"step": 12245
|
|
},
|
|
{
|
|
"entropy": 5.6241144180297855,
|
|
"epoch": 1.0291535391724427,
|
|
"grad_norm": 7.40625,
|
|
"learning_rate": 0.0004899879867113746,
|
|
"loss": 5.3522,
|
|
"mean_token_accuracy": 0.1685326635837555,
|
|
"num_tokens": 22600581.0,
|
|
"step": 12250
|
|
},
|
|
{
|
|
"entropy": 5.778968381881714,
|
|
"epoch": 1.0295736189876077,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004899791510514945,
|
|
"loss": 5.5624,
|
|
"mean_token_accuracy": 0.1573510617017746,
|
|
"num_tokens": 22610822.0,
|
|
"step": 12255
|
|
},
|
|
{
|
|
"entropy": 5.718343067169189,
|
|
"epoch": 1.0299936988027725,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.0004899703115833696,
|
|
"loss": 5.5635,
|
|
"mean_token_accuracy": 0.15985433757305145,
|
|
"num_tokens": 22619484.0,
|
|
"step": 12260
|
|
},
|
|
{
|
|
"entropy": 5.702852535247803,
|
|
"epoch": 1.0304137786179375,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0004899614683071563,
|
|
"loss": 5.4026,
|
|
"mean_token_accuracy": 0.16532386988401412,
|
|
"num_tokens": 22629038.0,
|
|
"step": 12265
|
|
},
|
|
{
|
|
"entropy": 5.710755681991577,
|
|
"epoch": 1.0308338584331023,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004899526212230112,
|
|
"loss": 5.5015,
|
|
"mean_token_accuracy": 0.15492028892040252,
|
|
"num_tokens": 22638619.0,
|
|
"step": 12270
|
|
},
|
|
{
|
|
"entropy": 5.6497485637664795,
|
|
"epoch": 1.0312539382482673,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004899437703310912,
|
|
"loss": 5.4953,
|
|
"mean_token_accuracy": 0.15784212797880173,
|
|
"num_tokens": 22648065.0,
|
|
"step": 12275
|
|
},
|
|
{
|
|
"entropy": 5.785039758682251,
|
|
"epoch": 1.031674018063432,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004899349156315529,
|
|
"loss": 5.5548,
|
|
"mean_token_accuracy": 0.15540689527988433,
|
|
"num_tokens": 22658107.0,
|
|
"step": 12280
|
|
},
|
|
{
|
|
"entropy": 5.763218879699707,
|
|
"epoch": 1.0320940978785969,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004899260571245533,
|
|
"loss": 5.4364,
|
|
"mean_token_accuracy": 0.15831174105405807,
|
|
"num_tokens": 22667103.0,
|
|
"step": 12285
|
|
},
|
|
{
|
|
"entropy": 5.657338285446167,
|
|
"epoch": 1.0325141776937619,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004899171948102492,
|
|
"loss": 5.4215,
|
|
"mean_token_accuracy": 0.1609957903623581,
|
|
"num_tokens": 22676792.0,
|
|
"step": 12290
|
|
},
|
|
{
|
|
"entropy": 5.659443616867065,
|
|
"epoch": 1.0329342575089266,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004899083286887977,
|
|
"loss": 5.4341,
|
|
"mean_token_accuracy": 0.16280112117528917,
|
|
"num_tokens": 22685344.0,
|
|
"step": 12295
|
|
},
|
|
{
|
|
"entropy": 5.744201564788819,
|
|
"epoch": 1.0333543373240917,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004898994587603559,
|
|
"loss": 5.494,
|
|
"mean_token_accuracy": 0.16114521920681,
|
|
"num_tokens": 22694387.0,
|
|
"step": 12300
|
|
},
|
|
{
|
|
"entropy": 5.676422786712647,
|
|
"epoch": 1.0337744171392564,
|
|
"grad_norm": 2.671875,
|
|
"learning_rate": 0.0004898905850250807,
|
|
"loss": 5.5338,
|
|
"mean_token_accuracy": 0.15966002494096757,
|
|
"num_tokens": 22704203.0,
|
|
"step": 12305
|
|
},
|
|
{
|
|
"entropy": 5.766281652450561,
|
|
"epoch": 1.0341944969544214,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004898817074831295,
|
|
"loss": 5.5864,
|
|
"mean_token_accuracy": 0.15488215833902358,
|
|
"num_tokens": 22713518.0,
|
|
"step": 12310
|
|
},
|
|
{
|
|
"entropy": 5.786115884780884,
|
|
"epoch": 1.0346145767695862,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004898728261346595,
|
|
"loss": 5.5937,
|
|
"mean_token_accuracy": 0.15901960879564286,
|
|
"num_tokens": 22722997.0,
|
|
"step": 12315
|
|
},
|
|
{
|
|
"entropy": 5.779094600677491,
|
|
"epoch": 1.035034656584751,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.000489863940979828,
|
|
"loss": 5.517,
|
|
"mean_token_accuracy": 0.15875154137611389,
|
|
"num_tokens": 22732385.0,
|
|
"step": 12320
|
|
},
|
|
{
|
|
"entropy": 5.638829135894776,
|
|
"epoch": 1.035454736399916,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.0004898550520187925,
|
|
"loss": 5.3829,
|
|
"mean_token_accuracy": 0.16449802070856095,
|
|
"num_tokens": 22741148.0,
|
|
"step": 12325
|
|
},
|
|
{
|
|
"entropy": 5.625936222076416,
|
|
"epoch": 1.0358748162150808,
|
|
"grad_norm": 2.109375,
|
|
"learning_rate": 0.0004898461592517103,
|
|
"loss": 5.3933,
|
|
"mean_token_accuracy": 0.16470819115638732,
|
|
"num_tokens": 22750239.0,
|
|
"step": 12330
|
|
},
|
|
{
|
|
"entropy": 5.745430994033813,
|
|
"epoch": 1.0362948960302458,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004898372626787391,
|
|
"loss": 5.5446,
|
|
"mean_token_accuracy": 0.1539967328310013,
|
|
"num_tokens": 22759290.0,
|
|
"step": 12335
|
|
},
|
|
{
|
|
"entropy": 5.816291427612304,
|
|
"epoch": 1.0367149758454106,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004898283623000364,
|
|
"loss": 5.5656,
|
|
"mean_token_accuracy": 0.15133108496665953,
|
|
"num_tokens": 22768450.0,
|
|
"step": 12340
|
|
},
|
|
{
|
|
"entropy": 5.727371311187744,
|
|
"epoch": 1.0371350556605754,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.0004898194581157598,
|
|
"loss": 5.4337,
|
|
"mean_token_accuracy": 0.1574019357562065,
|
|
"num_tokens": 22777711.0,
|
|
"step": 12345
|
|
},
|
|
{
|
|
"entropy": 5.715303945541382,
|
|
"epoch": 1.0375551354757404,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004898105501260671,
|
|
"loss": 5.5086,
|
|
"mean_token_accuracy": 0.15961914360523224,
|
|
"num_tokens": 22787153.0,
|
|
"step": 12350
|
|
},
|
|
{
|
|
"entropy": 5.750590991973877,
|
|
"epoch": 1.0379752152909052,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004898016383311163,
|
|
"loss": 5.5032,
|
|
"mean_token_accuracy": 0.16274025440216064,
|
|
"num_tokens": 22797125.0,
|
|
"step": 12355
|
|
},
|
|
{
|
|
"entropy": 5.755704784393311,
|
|
"epoch": 1.0383952951060702,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.000489792722731065,
|
|
"loss": 5.5023,
|
|
"mean_token_accuracy": 0.16141920089721679,
|
|
"num_tokens": 22806478.0,
|
|
"step": 12360
|
|
},
|
|
{
|
|
"entropy": 5.763215065002441,
|
|
"epoch": 1.038815374921235,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004897838033260712,
|
|
"loss": 5.5097,
|
|
"mean_token_accuracy": 0.1506726086139679,
|
|
"num_tokens": 22815375.0,
|
|
"step": 12365
|
|
},
|
|
{
|
|
"entropy": 5.749024534225464,
|
|
"epoch": 1.0392354547364,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004897748801162929,
|
|
"loss": 5.48,
|
|
"mean_token_accuracy": 0.1599726364016533,
|
|
"num_tokens": 22824401.0,
|
|
"step": 12370
|
|
},
|
|
{
|
|
"entropy": 5.738317537307739,
|
|
"epoch": 1.0396555345515648,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004897659531018882,
|
|
"loss": 5.5881,
|
|
"mean_token_accuracy": 0.16056930348277093,
|
|
"num_tokens": 22833933.0,
|
|
"step": 12375
|
|
},
|
|
{
|
|
"entropy": 5.668944311141968,
|
|
"epoch": 1.0400756143667296,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004897570222830152,
|
|
"loss": 5.466,
|
|
"mean_token_accuracy": 0.15872760713100434,
|
|
"num_tokens": 22843779.0,
|
|
"step": 12380
|
|
},
|
|
{
|
|
"entropy": 5.750452899932862,
|
|
"epoch": 1.0404956941818946,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004897480876598322,
|
|
"loss": 5.5429,
|
|
"mean_token_accuracy": 0.15892354696989058,
|
|
"num_tokens": 22852951.0,
|
|
"step": 12385
|
|
},
|
|
{
|
|
"entropy": 5.7894142150878904,
|
|
"epoch": 1.0409157739970594,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004897391492324974,
|
|
"loss": 5.5579,
|
|
"mean_token_accuracy": 0.15414744317531587,
|
|
"num_tokens": 22861398.0,
|
|
"step": 12390
|
|
},
|
|
{
|
|
"entropy": 5.685288190841675,
|
|
"epoch": 1.0413358538122244,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004897302070011691,
|
|
"loss": 5.443,
|
|
"mean_token_accuracy": 0.16238410770893097,
|
|
"num_tokens": 22870518.0,
|
|
"step": 12395
|
|
},
|
|
{
|
|
"entropy": 5.6724052906036375,
|
|
"epoch": 1.0417559336273892,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004897212609660058,
|
|
"loss": 5.5142,
|
|
"mean_token_accuracy": 0.1540789134800434,
|
|
"num_tokens": 22879389.0,
|
|
"step": 12400
|
|
},
|
|
{
|
|
"entropy": 5.71264681816101,
|
|
"epoch": 1.0421760134425542,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004897123111271659,
|
|
"loss": 5.5202,
|
|
"mean_token_accuracy": 0.16525834202766418,
|
|
"num_tokens": 22888977.0,
|
|
"step": 12405
|
|
},
|
|
{
|
|
"entropy": 5.797454452514648,
|
|
"epoch": 1.042596093257719,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.0004897033574848079,
|
|
"loss": 5.5219,
|
|
"mean_token_accuracy": 0.1619536757469177,
|
|
"num_tokens": 22898446.0,
|
|
"step": 12410
|
|
},
|
|
{
|
|
"entropy": 5.679445219039917,
|
|
"epoch": 1.0430161730728837,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004896944000390907,
|
|
"loss": 5.5193,
|
|
"mean_token_accuracy": 0.160529488325119,
|
|
"num_tokens": 22908044.0,
|
|
"step": 12415
|
|
},
|
|
{
|
|
"entropy": 5.76577320098877,
|
|
"epoch": 1.0434362528880488,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004896854387901725,
|
|
"loss": 5.5744,
|
|
"mean_token_accuracy": 0.1569056034088135,
|
|
"num_tokens": 22917330.0,
|
|
"step": 12420
|
|
},
|
|
{
|
|
"entropy": 5.791294860839844,
|
|
"epoch": 1.0438563327032135,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004896764737382124,
|
|
"loss": 5.5241,
|
|
"mean_token_accuracy": 0.165054389834404,
|
|
"num_tokens": 22927160.0,
|
|
"step": 12425
|
|
},
|
|
{
|
|
"entropy": 5.762517213821411,
|
|
"epoch": 1.0442764125183785,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004896675048833691,
|
|
"loss": 5.4934,
|
|
"mean_token_accuracy": 0.15579424053430557,
|
|
"num_tokens": 22936755.0,
|
|
"step": 12430
|
|
},
|
|
{
|
|
"entropy": 5.722726345062256,
|
|
"epoch": 1.0446964923335433,
|
|
"grad_norm": 1.9765625,
|
|
"learning_rate": 0.0004896585322258014,
|
|
"loss": 5.4676,
|
|
"mean_token_accuracy": 0.16214157938957213,
|
|
"num_tokens": 22945699.0,
|
|
"step": 12435
|
|
},
|
|
{
|
|
"entropy": 5.719200086593628,
|
|
"epoch": 1.0451165721487083,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004896495557656685,
|
|
"loss": 5.4392,
|
|
"mean_token_accuracy": 0.16894277483224868,
|
|
"num_tokens": 22954001.0,
|
|
"step": 12440
|
|
},
|
|
{
|
|
"entropy": 5.793335008621216,
|
|
"epoch": 1.0455366519638731,
|
|
"grad_norm": 2.921875,
|
|
"learning_rate": 0.0004896405755031293,
|
|
"loss": 5.5349,
|
|
"mean_token_accuracy": 0.15977277606725693,
|
|
"num_tokens": 22963805.0,
|
|
"step": 12445
|
|
},
|
|
{
|
|
"entropy": 5.649032068252564,
|
|
"epoch": 1.045956731779038,
|
|
"grad_norm": 2.65625,
|
|
"learning_rate": 0.0004896315914383427,
|
|
"loss": 5.488,
|
|
"mean_token_accuracy": 0.15751570016145705,
|
|
"num_tokens": 22973542.0,
|
|
"step": 12450
|
|
},
|
|
{
|
|
"entropy": 5.6373741149902346,
|
|
"epoch": 1.046376811594203,
|
|
"grad_norm": 2.8125,
|
|
"learning_rate": 0.0004896226035714679,
|
|
"loss": 5.3747,
|
|
"mean_token_accuracy": 0.1678253784775734,
|
|
"num_tokens": 22982417.0,
|
|
"step": 12455
|
|
},
|
|
{
|
|
"entropy": 5.75918436050415,
|
|
"epoch": 1.0467968914093677,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004896136119026642,
|
|
"loss": 5.4927,
|
|
"mean_token_accuracy": 0.1621846318244934,
|
|
"num_tokens": 22992879.0,
|
|
"step": 12460
|
|
},
|
|
{
|
|
"entropy": 5.684741640090943,
|
|
"epoch": 1.0472169712245327,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004896046164320911,
|
|
"loss": 5.377,
|
|
"mean_token_accuracy": 0.16672905832529067,
|
|
"num_tokens": 23001344.0,
|
|
"step": 12465
|
|
},
|
|
{
|
|
"entropy": 5.665459394454956,
|
|
"epoch": 1.0476370510396975,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.0004895956171599075,
|
|
"loss": 5.4296,
|
|
"mean_token_accuracy": 0.16854447424411773,
|
|
"num_tokens": 23010007.0,
|
|
"step": 12470
|
|
},
|
|
{
|
|
"entropy": 5.713906764984131,
|
|
"epoch": 1.0480571308548625,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004895866140862731,
|
|
"loss": 5.5401,
|
|
"mean_token_accuracy": 0.15899324417114258,
|
|
"num_tokens": 23019120.0,
|
|
"step": 12475
|
|
},
|
|
{
|
|
"entropy": 5.713853216171264,
|
|
"epoch": 1.0484772106700273,
|
|
"grad_norm": 3.265625,
|
|
"learning_rate": 0.0004895776072113473,
|
|
"loss": 5.5248,
|
|
"mean_token_accuracy": 0.1668669253587723,
|
|
"num_tokens": 23028562.0,
|
|
"step": 12480
|
|
},
|
|
{
|
|
"entropy": 5.7258385181427,
|
|
"epoch": 1.048897290485192,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 0.0004895685965352898,
|
|
"loss": 5.4638,
|
|
"mean_token_accuracy": 0.15974617898464202,
|
|
"num_tokens": 23037687.0,
|
|
"step": 12485
|
|
},
|
|
{
|
|
"entropy": 5.784030103683472,
|
|
"epoch": 1.049317370300357,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004895595820582601,
|
|
"loss": 5.4561,
|
|
"mean_token_accuracy": 0.16309544295072556,
|
|
"num_tokens": 23047475.0,
|
|
"step": 12490
|
|
},
|
|
{
|
|
"entropy": 5.745016288757324,
|
|
"epoch": 1.0497374501155219,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0004895505637804177,
|
|
"loss": 5.5025,
|
|
"mean_token_accuracy": 0.15667191743850709,
|
|
"num_tokens": 23057475.0,
|
|
"step": 12495
|
|
},
|
|
{
|
|
"entropy": 5.627092552185059,
|
|
"epoch": 1.050157529930687,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004895415417019227,
|
|
"loss": 5.4526,
|
|
"mean_token_accuracy": 0.1620897650718689,
|
|
"num_tokens": 23066419.0,
|
|
"step": 12500
|
|
},
|
|
{
|
|
"entropy": 5.731131649017334,
|
|
"epoch": 1.0505776097458517,
|
|
"grad_norm": 2.6875,
|
|
"learning_rate": 0.0004895325158229346,
|
|
"loss": 5.52,
|
|
"mean_token_accuracy": 0.16126697659492492,
|
|
"num_tokens": 23075516.0,
|
|
"step": 12505
|
|
},
|
|
{
|
|
"entropy": 5.699992942810058,
|
|
"epoch": 1.0509976895610167,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004895234861436136,
|
|
"loss": 5.4051,
|
|
"mean_token_accuracy": 0.1671866923570633,
|
|
"num_tokens": 23084132.0,
|
|
"step": 12510
|
|
},
|
|
{
|
|
"entropy": 5.773825979232788,
|
|
"epoch": 1.0514177693761815,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004895144526641194,
|
|
"loss": 5.4907,
|
|
"mean_token_accuracy": 0.16415202766656875,
|
|
"num_tokens": 23093958.0,
|
|
"step": 12515
|
|
},
|
|
{
|
|
"entropy": 5.825757265090942,
|
|
"epoch": 1.0518378491913463,
|
|
"grad_norm": 3.1875,
|
|
"learning_rate": 0.0004895054153846123,
|
|
"loss": 5.5132,
|
|
"mean_token_accuracy": 0.15762871354818345,
|
|
"num_tokens": 23103524.0,
|
|
"step": 12520
|
|
},
|
|
{
|
|
"entropy": 5.695266485214233,
|
|
"epoch": 1.0522579290065113,
|
|
"grad_norm": 2.265625,
|
|
"learning_rate": 0.0004894963743052521,
|
|
"loss": 5.4528,
|
|
"mean_token_accuracy": 0.1564144730567932,
|
|
"num_tokens": 23112445.0,
|
|
"step": 12525
|
|
},
|
|
{
|
|
"entropy": 5.752588367462158,
|
|
"epoch": 1.052678008821676,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004894873294261991,
|
|
"loss": 5.5023,
|
|
"mean_token_accuracy": 0.15996319204568862,
|
|
"num_tokens": 23121299.0,
|
|
"step": 12530
|
|
},
|
|
{
|
|
"entropy": 5.796125411987305,
|
|
"epoch": 1.053098088636841,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004894782807476134,
|
|
"loss": 5.515,
|
|
"mean_token_accuracy": 0.15295109748840333,
|
|
"num_tokens": 23130260.0,
|
|
"step": 12535
|
|
},
|
|
{
|
|
"entropy": 5.73946213722229,
|
|
"epoch": 1.0535181684520059,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004894692282696555,
|
|
"loss": 5.4445,
|
|
"mean_token_accuracy": 0.16455678939819335,
|
|
"num_tokens": 23139335.0,
|
|
"step": 12540
|
|
},
|
|
{
|
|
"entropy": 5.676587677001953,
|
|
"epoch": 1.0539382482671709,
|
|
"grad_norm": 3.40625,
|
|
"learning_rate": 0.0004894601719924857,
|
|
"loss": 5.4467,
|
|
"mean_token_accuracy": 0.16182892471551896,
|
|
"num_tokens": 23149299.0,
|
|
"step": 12545
|
|
},
|
|
{
|
|
"entropy": 5.657404994964599,
|
|
"epoch": 1.0543583280823356,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004894511119162644,
|
|
"loss": 5.4024,
|
|
"mean_token_accuracy": 0.16588439494371415,
|
|
"num_tokens": 23158651.0,
|
|
"step": 12550
|
|
},
|
|
{
|
|
"entropy": 5.765262174606323,
|
|
"epoch": 1.0547784078975004,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.000489442048041152,
|
|
"loss": 5.5059,
|
|
"mean_token_accuracy": 0.15651110857725142,
|
|
"num_tokens": 23167629.0,
|
|
"step": 12555
|
|
},
|
|
{
|
|
"entropy": 5.784561347961426,
|
|
"epoch": 1.0551984877126654,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004894329803673092,
|
|
"loss": 5.4831,
|
|
"mean_token_accuracy": 0.15658986270427705,
|
|
"num_tokens": 23177026.0,
|
|
"step": 12560
|
|
},
|
|
{
|
|
"entropy": 5.715599107742309,
|
|
"epoch": 1.0556185675278302,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004894239088948964,
|
|
"loss": 5.4563,
|
|
"mean_token_accuracy": 0.16440708339214324,
|
|
"num_tokens": 23185297.0,
|
|
"step": 12565
|
|
},
|
|
{
|
|
"entropy": 5.6783417701721195,
|
|
"epoch": 1.0560386473429952,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.0004894148336240747,
|
|
"loss": 5.4547,
|
|
"mean_token_accuracy": 0.16344198435544968,
|
|
"num_tokens": 23194804.0,
|
|
"step": 12570
|
|
},
|
|
{
|
|
"entropy": 5.733673143386841,
|
|
"epoch": 1.05645872715816,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004894057545550045,
|
|
"loss": 5.5114,
|
|
"mean_token_accuracy": 0.15659749060869216,
|
|
"num_tokens": 23205063.0,
|
|
"step": 12575
|
|
},
|
|
{
|
|
"entropy": 5.70140643119812,
|
|
"epoch": 1.056878806973325,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004893966716878467,
|
|
"loss": 5.4211,
|
|
"mean_token_accuracy": 0.16358228921890258,
|
|
"num_tokens": 23215038.0,
|
|
"step": 12580
|
|
},
|
|
{
|
|
"entropy": 5.809863042831421,
|
|
"epoch": 1.0572988867884898,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004893875850227624,
|
|
"loss": 5.5971,
|
|
"mean_token_accuracy": 0.15614084005355836,
|
|
"num_tokens": 23223530.0,
|
|
"step": 12585
|
|
},
|
|
{
|
|
"entropy": 5.7787669658660885,
|
|
"epoch": 1.0577189666036546,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004893784945599124,
|
|
"loss": 5.5362,
|
|
"mean_token_accuracy": 0.15725200325250627,
|
|
"num_tokens": 23232547.0,
|
|
"step": 12590
|
|
},
|
|
{
|
|
"entropy": 5.7005228996276855,
|
|
"epoch": 1.0581390464188196,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004893694002994577,
|
|
"loss": 5.5717,
|
|
"mean_token_accuracy": 0.15885683745145798,
|
|
"num_tokens": 23241305.0,
|
|
"step": 12595
|
|
},
|
|
{
|
|
"entropy": 5.873926401138306,
|
|
"epoch": 1.0585591262339844,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004893603022415595,
|
|
"loss": 5.5929,
|
|
"mean_token_accuracy": 0.16115136668086052,
|
|
"num_tokens": 23250708.0,
|
|
"step": 12600
|
|
},
|
|
{
|
|
"entropy": 5.781400442123413,
|
|
"epoch": 1.0589792060491494,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004893512003863788,
|
|
"loss": 5.4929,
|
|
"mean_token_accuracy": 0.15788125842809678,
|
|
"num_tokens": 23260161.0,
|
|
"step": 12605
|
|
},
|
|
{
|
|
"entropy": 5.643087482452392,
|
|
"epoch": 1.0593992858643142,
|
|
"grad_norm": 2.546875,
|
|
"learning_rate": 0.0004893420947340771,
|
|
"loss": 5.3975,
|
|
"mean_token_accuracy": 0.15917274206876755,
|
|
"num_tokens": 23268932.0,
|
|
"step": 12610
|
|
},
|
|
{
|
|
"entropy": 5.656122493743896,
|
|
"epoch": 1.0598193656794792,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004893329852848155,
|
|
"loss": 5.49,
|
|
"mean_token_accuracy": 0.1633685499429703,
|
|
"num_tokens": 23277741.0,
|
|
"step": 12615
|
|
},
|
|
{
|
|
"entropy": 5.695137357711792,
|
|
"epoch": 1.060239445494644,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004893238720387555,
|
|
"loss": 5.4876,
|
|
"mean_token_accuracy": 0.16543712615966796,
|
|
"num_tokens": 23286982.0,
|
|
"step": 12620
|
|
},
|
|
{
|
|
"entropy": 5.719235515594482,
|
|
"epoch": 1.0606595253098088,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004893147549960584,
|
|
"loss": 5.4086,
|
|
"mean_token_accuracy": 0.1632736824452877,
|
|
"num_tokens": 23296902.0,
|
|
"step": 12625
|
|
},
|
|
{
|
|
"entropy": 5.667002820968628,
|
|
"epoch": 1.0610796051249738,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004893056341568857,
|
|
"loss": 5.4446,
|
|
"mean_token_accuracy": 0.17050562798976898,
|
|
"num_tokens": 23305443.0,
|
|
"step": 12630
|
|
},
|
|
{
|
|
"entropy": 5.652026033401489,
|
|
"epoch": 1.0614996849401386,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004892965095213992,
|
|
"loss": 5.4055,
|
|
"mean_token_accuracy": 0.16462423503398896,
|
|
"num_tokens": 23315420.0,
|
|
"step": 12635
|
|
},
|
|
{
|
|
"entropy": 5.749802541732788,
|
|
"epoch": 1.0619197647553036,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004892873810897604,
|
|
"loss": 5.52,
|
|
"mean_token_accuracy": 0.15769467800855635,
|
|
"num_tokens": 23324540.0,
|
|
"step": 12640
|
|
},
|
|
{
|
|
"entropy": 5.699535608291626,
|
|
"epoch": 1.0623398445704684,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004892782488621308,
|
|
"loss": 5.4591,
|
|
"mean_token_accuracy": 0.16499506682157516,
|
|
"num_tokens": 23334282.0,
|
|
"step": 12645
|
|
},
|
|
{
|
|
"entropy": 5.66744704246521,
|
|
"epoch": 1.0627599243856332,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004892691128386725,
|
|
"loss": 5.4481,
|
|
"mean_token_accuracy": 0.16394151598215104,
|
|
"num_tokens": 23342836.0,
|
|
"step": 12650
|
|
},
|
|
{
|
|
"entropy": 5.73470139503479,
|
|
"epoch": 1.0631800042007982,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004892599730195471,
|
|
"loss": 5.4398,
|
|
"mean_token_accuracy": 0.16658534705638886,
|
|
"num_tokens": 23351863.0,
|
|
"step": 12655
|
|
},
|
|
{
|
|
"entropy": 5.849531030654907,
|
|
"epoch": 1.063600084015963,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004892508294049167,
|
|
"loss": 5.587,
|
|
"mean_token_accuracy": 0.16241029053926467,
|
|
"num_tokens": 23361788.0,
|
|
"step": 12660
|
|
},
|
|
{
|
|
"entropy": 5.734504842758179,
|
|
"epoch": 1.064020163831128,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004892416819949431,
|
|
"loss": 5.417,
|
|
"mean_token_accuracy": 0.1586165800690651,
|
|
"num_tokens": 23370175.0,
|
|
"step": 12665
|
|
},
|
|
{
|
|
"entropy": 5.662772369384766,
|
|
"epoch": 1.0644402436462927,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004892325307897886,
|
|
"loss": 5.4679,
|
|
"mean_token_accuracy": 0.16151935011148452,
|
|
"num_tokens": 23378835.0,
|
|
"step": 12670
|
|
},
|
|
{
|
|
"entropy": 5.69964337348938,
|
|
"epoch": 1.0648603234614578,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004892233757896149,
|
|
"loss": 5.4882,
|
|
"mean_token_accuracy": 0.16086159497499466,
|
|
"num_tokens": 23389390.0,
|
|
"step": 12675
|
|
},
|
|
{
|
|
"entropy": 5.747312068939209,
|
|
"epoch": 1.0652804032766225,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.0004892142169945845,
|
|
"loss": 5.4696,
|
|
"mean_token_accuracy": 0.16162890940904617,
|
|
"num_tokens": 23398802.0,
|
|
"step": 12680
|
|
},
|
|
{
|
|
"entropy": 5.651406192779541,
|
|
"epoch": 1.0657004830917876,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004892050544048596,
|
|
"loss": 5.4437,
|
|
"mean_token_accuracy": 0.1581580176949501,
|
|
"num_tokens": 23407731.0,
|
|
"step": 12685
|
|
},
|
|
{
|
|
"entropy": 5.678383111953735,
|
|
"epoch": 1.0661205629069523,
|
|
"grad_norm": 2.5625,
|
|
"learning_rate": 0.0004891958880206024,
|
|
"loss": 5.4943,
|
|
"mean_token_accuracy": 0.1613358035683632,
|
|
"num_tokens": 23417046.0,
|
|
"step": 12690
|
|
},
|
|
{
|
|
"entropy": 5.715891933441162,
|
|
"epoch": 1.0665406427221171,
|
|
"grad_norm": 3.09375,
|
|
"learning_rate": 0.0004891867178419753,
|
|
"loss": 5.483,
|
|
"mean_token_accuracy": 0.16457503885030747,
|
|
"num_tokens": 23426107.0,
|
|
"step": 12695
|
|
},
|
|
{
|
|
"entropy": 5.764684009552002,
|
|
"epoch": 1.0669607225372821,
|
|
"grad_norm": 2.46875,
|
|
"learning_rate": 0.0004891775438691408,
|
|
"loss": 5.529,
|
|
"mean_token_accuracy": 0.1606599673628807,
|
|
"num_tokens": 23435523.0,
|
|
"step": 12700
|
|
},
|
|
{
|
|
"entropy": 5.7297052383422855,
|
|
"epoch": 1.067380802352447,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.0004891683661022615,
|
|
"loss": 5.4656,
|
|
"mean_token_accuracy": 0.16791018098592758,
|
|
"num_tokens": 23444185.0,
|
|
"step": 12705
|
|
},
|
|
{
|
|
"entropy": 5.854583692550659,
|
|
"epoch": 1.067800882167612,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004891591845414997,
|
|
"loss": 5.6673,
|
|
"mean_token_accuracy": 0.14531354904174804,
|
|
"num_tokens": 23454100.0,
|
|
"step": 12710
|
|
},
|
|
{
|
|
"entropy": 5.826853799819946,
|
|
"epoch": 1.0682209619827767,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004891499991870184,
|
|
"loss": 5.551,
|
|
"mean_token_accuracy": 0.1544210582971573,
|
|
"num_tokens": 23463415.0,
|
|
"step": 12715
|
|
},
|
|
{
|
|
"entropy": 5.737698650360107,
|
|
"epoch": 1.0686410417979415,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048914081003898,
|
|
"loss": 5.4489,
|
|
"mean_token_accuracy": 0.16085606068372726,
|
|
"num_tokens": 23471515.0,
|
|
"step": 12720
|
|
},
|
|
{
|
|
"entropy": 5.716708326339722,
|
|
"epoch": 1.0690611216131065,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.0004891316170975475,
|
|
"loss": 5.519,
|
|
"mean_token_accuracy": 0.15879474133253096,
|
|
"num_tokens": 23481696.0,
|
|
"step": 12725
|
|
},
|
|
{
|
|
"entropy": 5.769877147674561,
|
|
"epoch": 1.0694812014282713,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004891224203628836,
|
|
"loss": 5.4707,
|
|
"mean_token_accuracy": 0.16245131343603134,
|
|
"num_tokens": 23490714.0,
|
|
"step": 12730
|
|
},
|
|
{
|
|
"entropy": 5.664735269546509,
|
|
"epoch": 1.0699012812434363,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004891132198351514,
|
|
"loss": 5.4382,
|
|
"mean_token_accuracy": 0.16582657247781754,
|
|
"num_tokens": 23500368.0,
|
|
"step": 12735
|
|
},
|
|
{
|
|
"entropy": 5.5692836284637455,
|
|
"epoch": 1.070321361058601,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004891040155145137,
|
|
"loss": 5.4033,
|
|
"mean_token_accuracy": 0.16731729805469514,
|
|
"num_tokens": 23508857.0,
|
|
"step": 12740
|
|
},
|
|
{
|
|
"entropy": 5.614154720306397,
|
|
"epoch": 1.070741440873766,
|
|
"grad_norm": 2.3125,
|
|
"learning_rate": 0.0004890948074011335,
|
|
"loss": 5.3763,
|
|
"mean_token_accuracy": 0.17136704474687575,
|
|
"num_tokens": 23518128.0,
|
|
"step": 12745
|
|
},
|
|
{
|
|
"entropy": 5.760420560836792,
|
|
"epoch": 1.071161520688931,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004890855954951741,
|
|
"loss": 5.484,
|
|
"mean_token_accuracy": 0.16657705008983612,
|
|
"num_tokens": 23527292.0,
|
|
"step": 12750
|
|
},
|
|
{
|
|
"entropy": 5.741787719726562,
|
|
"epoch": 1.0715816005040957,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004890763797967987,
|
|
"loss": 5.4957,
|
|
"mean_token_accuracy": 0.16227886378765105,
|
|
"num_tokens": 23535694.0,
|
|
"step": 12755
|
|
},
|
|
{
|
|
"entropy": 5.71260814666748,
|
|
"epoch": 1.0720016803192607,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.0004890671603061704,
|
|
"loss": 5.4826,
|
|
"mean_token_accuracy": 0.1630891129374504,
|
|
"num_tokens": 23544766.0,
|
|
"step": 12760
|
|
},
|
|
{
|
|
"entropy": 5.747214603424072,
|
|
"epoch": 1.0724217601344255,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.0004890579370234526,
|
|
"loss": 5.4528,
|
|
"mean_token_accuracy": 0.17010122984647752,
|
|
"num_tokens": 23554037.0,
|
|
"step": 12765
|
|
},
|
|
{
|
|
"entropy": 5.735277700424194,
|
|
"epoch": 1.0728418399495905,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004890487099488086,
|
|
"loss": 5.5074,
|
|
"mean_token_accuracy": 0.16082375198602678,
|
|
"num_tokens": 23562282.0,
|
|
"step": 12770
|
|
},
|
|
{
|
|
"entropy": 5.7627918243408205,
|
|
"epoch": 1.0732619197647553,
|
|
"grad_norm": 2.03125,
|
|
"learning_rate": 0.000489039479082402,
|
|
"loss": 5.5698,
|
|
"mean_token_accuracy": 0.15416947677731513,
|
|
"num_tokens": 23571955.0,
|
|
"step": 12775
|
|
},
|
|
{
|
|
"entropy": 5.707983779907226,
|
|
"epoch": 1.0736819995799203,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004890302444243962,
|
|
"loss": 5.4593,
|
|
"mean_token_accuracy": 0.15817068070173262,
|
|
"num_tokens": 23580996.0,
|
|
"step": 12780
|
|
},
|
|
{
|
|
"entropy": 5.742614698410034,
|
|
"epoch": 1.074102079395085,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004890210059749549,
|
|
"loss": 5.5578,
|
|
"mean_token_accuracy": 0.15325147658586502,
|
|
"num_tokens": 23589618.0,
|
|
"step": 12785
|
|
},
|
|
{
|
|
"entropy": 5.707106637954712,
|
|
"epoch": 1.0745221592102498,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004890117637342416,
|
|
"loss": 5.3992,
|
|
"mean_token_accuracy": 0.15842635929584503,
|
|
"num_tokens": 23599574.0,
|
|
"step": 12790
|
|
},
|
|
{
|
|
"entropy": 5.699951601028443,
|
|
"epoch": 1.0749422390254149,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004890025177024202,
|
|
"loss": 5.4576,
|
|
"mean_token_accuracy": 0.15711033195257187,
|
|
"num_tokens": 23609205.0,
|
|
"step": 12795
|
|
},
|
|
{
|
|
"entropy": 5.704383707046508,
|
|
"epoch": 1.0753623188405796,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.0004889932678796543,
|
|
"loss": 5.477,
|
|
"mean_token_accuracy": 0.1553438723087311,
|
|
"num_tokens": 23617554.0,
|
|
"step": 12800
|
|
},
|
|
{
|
|
"entropy": 5.7429163455963135,
|
|
"epoch": 1.0757823986557447,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.0004889840142661078,
|
|
"loss": 5.5275,
|
|
"mean_token_accuracy": 0.1620032712817192,
|
|
"num_tokens": 23626757.0,
|
|
"step": 12805
|
|
},
|
|
{
|
|
"entropy": 5.784508323669433,
|
|
"epoch": 1.0762024784709094,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004889747568619447,
|
|
"loss": 5.4918,
|
|
"mean_token_accuracy": 0.16077843606472014,
|
|
"num_tokens": 23636111.0,
|
|
"step": 12810
|
|
},
|
|
{
|
|
"entropy": 5.714294719696045,
|
|
"epoch": 1.0766225582860744,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004889654956673291,
|
|
"loss": 5.4738,
|
|
"mean_token_accuracy": 0.1631556496024132,
|
|
"num_tokens": 23644579.0,
|
|
"step": 12815
|
|
},
|
|
{
|
|
"entropy": 5.718669843673706,
|
|
"epoch": 1.0770426381012392,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.0004889562306824248,
|
|
"loss": 5.3889,
|
|
"mean_token_accuracy": 0.1642576903104782,
|
|
"num_tokens": 23653263.0,
|
|
"step": 12820
|
|
},
|
|
{
|
|
"entropy": 5.622420167922973,
|
|
"epoch": 1.077462717916404,
|
|
"grad_norm": 2.734375,
|
|
"learning_rate": 0.000488946961907396,
|
|
"loss": 5.3746,
|
|
"mean_token_accuracy": 0.17475783824920654,
|
|
"num_tokens": 23662529.0,
|
|
"step": 12825
|
|
},
|
|
{
|
|
"entropy": 5.601463699340821,
|
|
"epoch": 1.077882797731569,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004889376893424071,
|
|
"loss": 5.4116,
|
|
"mean_token_accuracy": 0.16859746128320693,
|
|
"num_tokens": 23671491.0,
|
|
"step": 12830
|
|
},
|
|
{
|
|
"entropy": 5.712467527389526,
|
|
"epoch": 1.0783028775467338,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004889284129876221,
|
|
"loss": 5.3964,
|
|
"mean_token_accuracy": 0.1634058475494385,
|
|
"num_tokens": 23680121.0,
|
|
"step": 12835
|
|
},
|
|
{
|
|
"entropy": 5.710364437103271,
|
|
"epoch": 1.0787229573618988,
|
|
"grad_norm": 2.34375,
|
|
"learning_rate": 0.0004889191328432054,
|
|
"loss": 5.4514,
|
|
"mean_token_accuracy": 0.16312524527311326,
|
|
"num_tokens": 23689008.0,
|
|
"step": 12840
|
|
},
|
|
{
|
|
"entropy": 5.679463958740234,
|
|
"epoch": 1.0791430371770636,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004889098489093215,
|
|
"loss": 5.4921,
|
|
"mean_token_accuracy": 0.15698808282613755,
|
|
"num_tokens": 23698551.0,
|
|
"step": 12845
|
|
},
|
|
{
|
|
"entropy": 5.807032060623169,
|
|
"epoch": 1.0795631169922286,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004889005611861347,
|
|
"loss": 5.653,
|
|
"mean_token_accuracy": 0.15691937357187272,
|
|
"num_tokens": 23707438.0,
|
|
"step": 12850
|
|
},
|
|
{
|
|
"entropy": 5.740315341949463,
|
|
"epoch": 1.0799831968073934,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004888912696738096,
|
|
"loss": 5.4873,
|
|
"mean_token_accuracy": 0.1622299484908581,
|
|
"num_tokens": 23715822.0,
|
|
"step": 12855
|
|
},
|
|
{
|
|
"entropy": 5.710565185546875,
|
|
"epoch": 1.0804032766225582,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004888819743725108,
|
|
"loss": 5.5124,
|
|
"mean_token_accuracy": 0.1617675766348839,
|
|
"num_tokens": 23725426.0,
|
|
"step": 12860
|
|
},
|
|
{
|
|
"entropy": 5.759817123413086,
|
|
"epoch": 1.0808233564377232,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.000488872675282403,
|
|
"loss": 5.5061,
|
|
"mean_token_accuracy": 0.15907542854547502,
|
|
"num_tokens": 23735092.0,
|
|
"step": 12865
|
|
},
|
|
{
|
|
"entropy": 5.7521524906158445,
|
|
"epoch": 1.081243436252888,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004888633724036509,
|
|
"loss": 5.483,
|
|
"mean_token_accuracy": 0.1602173462510109,
|
|
"num_tokens": 23744255.0,
|
|
"step": 12870
|
|
},
|
|
{
|
|
"entropy": 5.685547685623169,
|
|
"epoch": 1.081663516068053,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004888540657364192,
|
|
"loss": 5.3441,
|
|
"mean_token_accuracy": 0.16899178177118301,
|
|
"num_tokens": 23752978.0,
|
|
"step": 12875
|
|
},
|
|
{
|
|
"entropy": 5.691501188278198,
|
|
"epoch": 1.0820835958832178,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004888447552808729,
|
|
"loss": 5.4208,
|
|
"mean_token_accuracy": 0.16169283539056778,
|
|
"num_tokens": 23761051.0,
|
|
"step": 12880
|
|
},
|
|
{
|
|
"entropy": 5.744138574600219,
|
|
"epoch": 1.0825036756983828,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004888354410371768,
|
|
"loss": 5.5421,
|
|
"mean_token_accuracy": 0.15464980006217957,
|
|
"num_tokens": 23770818.0,
|
|
"step": 12885
|
|
},
|
|
{
|
|
"entropy": 5.808643198013305,
|
|
"epoch": 1.0829237555135476,
|
|
"grad_norm": 1.9453125,
|
|
"learning_rate": 0.000488826123005496,
|
|
"loss": 5.551,
|
|
"mean_token_accuracy": 0.16078845858573915,
|
|
"num_tokens": 23780597.0,
|
|
"step": 12890
|
|
},
|
|
{
|
|
"entropy": 5.6849569320678714,
|
|
"epoch": 1.0833438353287124,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.0004888168011859957,
|
|
"loss": 5.3939,
|
|
"mean_token_accuracy": 0.16438076347112657,
|
|
"num_tokens": 23790119.0,
|
|
"step": 12895
|
|
},
|
|
{
|
|
"entropy": 5.684030294418335,
|
|
"epoch": 1.0837639151438774,
|
|
"grad_norm": 1.9609375,
|
|
"learning_rate": 0.0004888074755788407,
|
|
"loss": 5.4628,
|
|
"mean_token_accuracy": 0.16382815986871718,
|
|
"num_tokens": 23798972.0,
|
|
"step": 12900
|
|
},
|
|
{
|
|
"entropy": 5.710223627090454,
|
|
"epoch": 1.0841839949590422,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.0004887981461841963,
|
|
"loss": 5.4488,
|
|
"mean_token_accuracy": 0.16691740453243256,
|
|
"num_tokens": 23808685.0,
|
|
"step": 12905
|
|
},
|
|
{
|
|
"entropy": 5.720619106292725,
|
|
"epoch": 1.0846040747742072,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004887888130022279,
|
|
"loss": 5.4419,
|
|
"mean_token_accuracy": 0.16202156841754914,
|
|
"num_tokens": 23817721.0,
|
|
"step": 12910
|
|
},
|
|
{
|
|
"entropy": 5.633600378036499,
|
|
"epoch": 1.085024154589372,
|
|
"grad_norm": 2.25,
|
|
"learning_rate": 0.0004887794760331008,
|
|
"loss": 5.4104,
|
|
"mean_token_accuracy": 0.16871272772550583,
|
|
"num_tokens": 23826892.0,
|
|
"step": 12915
|
|
},
|
|
{
|
|
"entropy": 5.680303621292114,
|
|
"epoch": 1.085444234404537,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004887701352769804,
|
|
"loss": 5.3538,
|
|
"mean_token_accuracy": 0.17344372123479843,
|
|
"num_tokens": 23835717.0,
|
|
"step": 12920
|
|
},
|
|
{
|
|
"entropy": 5.67311372756958,
|
|
"epoch": 1.0858643142197018,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.000488760790734032,
|
|
"loss": 5.4618,
|
|
"mean_token_accuracy": 0.16426583826541902,
|
|
"num_tokens": 23845814.0,
|
|
"step": 12925
|
|
},
|
|
{
|
|
"entropy": 5.720589590072632,
|
|
"epoch": 1.0862843940348665,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004887514424044214,
|
|
"loss": 5.4313,
|
|
"mean_token_accuracy": 0.1544219210743904,
|
|
"num_tokens": 23854779.0,
|
|
"step": 12930
|
|
},
|
|
{
|
|
"entropy": 5.624525451660157,
|
|
"epoch": 1.0867044738500315,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.000488742090288314,
|
|
"loss": 5.48,
|
|
"mean_token_accuracy": 0.15788208842277526,
|
|
"num_tokens": 23863533.0,
|
|
"step": 12935
|
|
},
|
|
{
|
|
"entropy": 5.733237457275391,
|
|
"epoch": 1.0871245536651963,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.0004887327343858755,
|
|
"loss": 5.5062,
|
|
"mean_token_accuracy": 0.1594018206000328,
|
|
"num_tokens": 23872725.0,
|
|
"step": 12940
|
|
},
|
|
{
|
|
"entropy": 5.74036169052124,
|
|
"epoch": 1.0875446334803613,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004887233746972717,
|
|
"loss": 5.5062,
|
|
"mean_token_accuracy": 0.15623962581157685,
|
|
"num_tokens": 23881799.0,
|
|
"step": 12945
|
|
},
|
|
{
|
|
"entropy": 5.698176908493042,
|
|
"epoch": 1.0879647132955261,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004887140112226684,
|
|
"loss": 5.5244,
|
|
"mean_token_accuracy": 0.16199513375759125,
|
|
"num_tokens": 23890628.0,
|
|
"step": 12950
|
|
},
|
|
{
|
|
"entropy": 5.695692157745361,
|
|
"epoch": 1.088384793110691,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004887046439622314,
|
|
"loss": 5.5056,
|
|
"mean_token_accuracy": 0.1679466962814331,
|
|
"num_tokens": 23899968.0,
|
|
"step": 12955
|
|
},
|
|
{
|
|
"entropy": 5.761798763275147,
|
|
"epoch": 1.088804872925856,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004886952729161267,
|
|
"loss": 5.3557,
|
|
"mean_token_accuracy": 0.16911853849887848,
|
|
"num_tokens": 23908634.0,
|
|
"step": 12960
|
|
},
|
|
{
|
|
"entropy": 5.672391033172607,
|
|
"epoch": 1.0892249527410207,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004886858980845202,
|
|
"loss": 5.5356,
|
|
"mean_token_accuracy": 0.1591207727789879,
|
|
"num_tokens": 23917925.0,
|
|
"step": 12965
|
|
},
|
|
{
|
|
"entropy": 5.618830633163452,
|
|
"epoch": 1.0896450325561857,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004886765194675782,
|
|
"loss": 5.4423,
|
|
"mean_token_accuracy": 0.16554049998521805,
|
|
"num_tokens": 23927173.0,
|
|
"step": 12970
|
|
},
|
|
{
|
|
"entropy": 5.706417846679687,
|
|
"epoch": 1.0900651123713505,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004886671370654665,
|
|
"loss": 5.4005,
|
|
"mean_token_accuracy": 0.16858862042427064,
|
|
"num_tokens": 23936258.0,
|
|
"step": 12975
|
|
},
|
|
{
|
|
"entropy": 5.690302896499634,
|
|
"epoch": 1.0904851921865155,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004886577508783516,
|
|
"loss": 5.3668,
|
|
"mean_token_accuracy": 0.16924734264612198,
|
|
"num_tokens": 23944215.0,
|
|
"step": 12980
|
|
},
|
|
{
|
|
"entropy": 5.726575565338135,
|
|
"epoch": 1.0909052720016803,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004886483609063997,
|
|
"loss": 5.4322,
|
|
"mean_token_accuracy": 0.16069534420967102,
|
|
"num_tokens": 23953151.0,
|
|
"step": 12985
|
|
},
|
|
{
|
|
"entropy": 5.620859098434448,
|
|
"epoch": 1.0913253518168453,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004886389671497769,
|
|
"loss": 5.4298,
|
|
"mean_token_accuracy": 0.17138218581676484,
|
|
"num_tokens": 23962919.0,
|
|
"step": 12990
|
|
},
|
|
{
|
|
"entropy": 5.771068859100342,
|
|
"epoch": 1.09174543163201,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.00048862956960865,
|
|
"loss": 5.4751,
|
|
"mean_token_accuracy": 0.15873296856880187,
|
|
"num_tokens": 23971900.0,
|
|
"step": 12995
|
|
},
|
|
{
|
|
"entropy": 5.6600103855133055,
|
|
"epoch": 1.0921655114471749,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004886201682831852,
|
|
"loss": 5.4397,
|
|
"mean_token_accuracy": 0.16507715582847596,
|
|
"num_tokens": 23980945.0,
|
|
"step": 13000
|
|
},
|
|
{
|
|
"entropy": 5.621271753311158,
|
|
"epoch": 1.09258559126234,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004886107631735491,
|
|
"loss": 5.3981,
|
|
"mean_token_accuracy": 0.1617731973528862,
|
|
"num_tokens": 23990460.0,
|
|
"step": 13005
|
|
},
|
|
{
|
|
"entropy": 5.712856674194336,
|
|
"epoch": 1.0930056710775047,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004886013542799083,
|
|
"loss": 5.5658,
|
|
"mean_token_accuracy": 0.1500063270330429,
|
|
"num_tokens": 23999925.0,
|
|
"step": 13010
|
|
},
|
|
{
|
|
"entropy": 5.6885334014892575,
|
|
"epoch": 1.0934257508926697,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004885919416024296,
|
|
"loss": 5.4223,
|
|
"mean_token_accuracy": 0.16228862702846528,
|
|
"num_tokens": 24009039.0,
|
|
"step": 13015
|
|
},
|
|
{
|
|
"entropy": 5.796774768829346,
|
|
"epoch": 1.0938458307078345,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.0004885825251412796,
|
|
"loss": 5.4635,
|
|
"mean_token_accuracy": 0.16517345756292343,
|
|
"num_tokens": 24017725.0,
|
|
"step": 13020
|
|
},
|
|
{
|
|
"entropy": 5.7257390975952145,
|
|
"epoch": 1.0942659105229993,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004885731048966252,
|
|
"loss": 5.4809,
|
|
"mean_token_accuracy": 0.1577977254986763,
|
|
"num_tokens": 24027158.0,
|
|
"step": 13025
|
|
},
|
|
{
|
|
"entropy": 5.62857027053833,
|
|
"epoch": 1.0946859903381643,
|
|
"grad_norm": 2.703125,
|
|
"learning_rate": 0.0004885636808686331,
|
|
"loss": 5.5101,
|
|
"mean_token_accuracy": 0.16572368741035462,
|
|
"num_tokens": 24037224.0,
|
|
"step": 13030
|
|
},
|
|
{
|
|
"entropy": 5.723979425430298,
|
|
"epoch": 1.095106070153329,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004885542530574705,
|
|
"loss": 5.486,
|
|
"mean_token_accuracy": 0.16158615350723265,
|
|
"num_tokens": 24046097.0,
|
|
"step": 13035
|
|
},
|
|
{
|
|
"entropy": 5.68514404296875,
|
|
"epoch": 1.095526149968494,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004885448214633042,
|
|
"loss": 5.3743,
|
|
"mean_token_accuracy": 0.16373368501663207,
|
|
"num_tokens": 24055270.0,
|
|
"step": 13040
|
|
},
|
|
{
|
|
"entropy": 5.738873386383057,
|
|
"epoch": 1.0959462297836589,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004885353860863013,
|
|
"loss": 5.5534,
|
|
"mean_token_accuracy": 0.15277508795261383,
|
|
"num_tokens": 24064995.0,
|
|
"step": 13045
|
|
},
|
|
{
|
|
"entropy": 5.781196355819702,
|
|
"epoch": 1.0963663095988239,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.000488525946926629,
|
|
"loss": 5.5763,
|
|
"mean_token_accuracy": 0.1572740152478218,
|
|
"num_tokens": 24075523.0,
|
|
"step": 13050
|
|
},
|
|
{
|
|
"entropy": 5.707388401031494,
|
|
"epoch": 1.0967863894139886,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004885165039844545,
|
|
"loss": 5.4404,
|
|
"mean_token_accuracy": 0.1659671738743782,
|
|
"num_tokens": 24084933.0,
|
|
"step": 13055
|
|
},
|
|
{
|
|
"entropy": 5.674459743499756,
|
|
"epoch": 1.0972064692291534,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004885070572599452,
|
|
"loss": 5.4905,
|
|
"mean_token_accuracy": 0.15537894517183304,
|
|
"num_tokens": 24093964.0,
|
|
"step": 13060
|
|
},
|
|
{
|
|
"entropy": 5.693677806854248,
|
|
"epoch": 1.0976265490443184,
|
|
"grad_norm": 1.84375,
|
|
"learning_rate": 0.0004884976067532681,
|
|
"loss": 5.4426,
|
|
"mean_token_accuracy": 0.1547652930021286,
|
|
"num_tokens": 24103951.0,
|
|
"step": 13065
|
|
},
|
|
{
|
|
"entropy": 5.671582841873169,
|
|
"epoch": 1.0980466288594832,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.000488488152464591,
|
|
"loss": 5.5455,
|
|
"mean_token_accuracy": 0.15263120234012603,
|
|
"num_tokens": 24113392.0,
|
|
"step": 13070
|
|
},
|
|
{
|
|
"entropy": 5.696406650543213,
|
|
"epoch": 1.0984667086746482,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004884786943940812,
|
|
"loss": 5.399,
|
|
"mean_token_accuracy": 0.16182387322187425,
|
|
"num_tokens": 24123165.0,
|
|
"step": 13075
|
|
},
|
|
{
|
|
"entropy": 5.67077898979187,
|
|
"epoch": 1.098886788489813,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004884692325419063,
|
|
"loss": 5.4597,
|
|
"mean_token_accuracy": 0.161738720536232,
|
|
"num_tokens": 24132176.0,
|
|
"step": 13080
|
|
},
|
|
{
|
|
"entropy": 5.703751039505005,
|
|
"epoch": 1.099306868304978,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004884597669082336,
|
|
"loss": 5.5231,
|
|
"mean_token_accuracy": 0.1586092695593834,
|
|
"num_tokens": 24141737.0,
|
|
"step": 13085
|
|
},
|
|
{
|
|
"entropy": 5.698824644088745,
|
|
"epoch": 1.0997269481201428,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004884502974932313,
|
|
"loss": 5.4568,
|
|
"mean_token_accuracy": 0.1605760633945465,
|
|
"num_tokens": 24150477.0,
|
|
"step": 13090
|
|
},
|
|
{
|
|
"entropy": 5.7847676277160645,
|
|
"epoch": 1.1001470279353076,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004884408242970668,
|
|
"loss": 5.5412,
|
|
"mean_token_accuracy": 0.1626332238316536,
|
|
"num_tokens": 24158739.0,
|
|
"step": 13095
|
|
},
|
|
{
|
|
"entropy": 5.6294517993927,
|
|
"epoch": 1.1005671077504726,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004884313473199081,
|
|
"loss": 5.3848,
|
|
"mean_token_accuracy": 0.16554880887269974,
|
|
"num_tokens": 24167511.0,
|
|
"step": 13100
|
|
},
|
|
{
|
|
"entropy": 5.6367988109588625,
|
|
"epoch": 1.1009871875656374,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004884218665619229,
|
|
"loss": 5.3834,
|
|
"mean_token_accuracy": 0.1640613242983818,
|
|
"num_tokens": 24176413.0,
|
|
"step": 13105
|
|
},
|
|
{
|
|
"entropy": 5.660191440582276,
|
|
"epoch": 1.1014072673808024,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004884123820232792,
|
|
"loss": 5.3493,
|
|
"mean_token_accuracy": 0.17108992785215377,
|
|
"num_tokens": 24185135.0,
|
|
"step": 13110
|
|
},
|
|
{
|
|
"entropy": 5.679283857345581,
|
|
"epoch": 1.1018273471959672,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004884028937041451,
|
|
"loss": 5.4342,
|
|
"mean_token_accuracy": 0.16592397689819335,
|
|
"num_tokens": 24193273.0,
|
|
"step": 13115
|
|
},
|
|
{
|
|
"entropy": 5.7048530101776125,
|
|
"epoch": 1.1022474270111322,
|
|
"grad_norm": 2.4375,
|
|
"learning_rate": 0.0004883934016046886,
|
|
"loss": 5.5063,
|
|
"mean_token_accuracy": 0.15767362713813782,
|
|
"num_tokens": 24202509.0,
|
|
"step": 13120
|
|
},
|
|
{
|
|
"entropy": 5.723241710662842,
|
|
"epoch": 1.102667506826297,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000488383905725078,
|
|
"loss": 5.4921,
|
|
"mean_token_accuracy": 0.15816541612148285,
|
|
"num_tokens": 24212644.0,
|
|
"step": 13125
|
|
},
|
|
{
|
|
"entropy": 5.697927713394165,
|
|
"epoch": 1.1030875866414618,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.0004883744060654811,
|
|
"loss": 5.3794,
|
|
"mean_token_accuracy": 0.16595268547534942,
|
|
"num_tokens": 24221838.0,
|
|
"step": 13130
|
|
},
|
|
{
|
|
"entropy": 5.6857527732849125,
|
|
"epoch": 1.1035076664566268,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.0004883649026260667,
|
|
"loss": 5.4619,
|
|
"mean_token_accuracy": 0.16348368525505066,
|
|
"num_tokens": 24230987.0,
|
|
"step": 13135
|
|
},
|
|
{
|
|
"entropy": 5.650682592391968,
|
|
"epoch": 1.1039277462717916,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004883553954070028,
|
|
"loss": 5.4118,
|
|
"mean_token_accuracy": 0.1683964341878891,
|
|
"num_tokens": 24240523.0,
|
|
"step": 13140
|
|
},
|
|
{
|
|
"entropy": 5.722428798675537,
|
|
"epoch": 1.1043478260869566,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.000488345884408458,
|
|
"loss": 5.5057,
|
|
"mean_token_accuracy": 0.16416788548231126,
|
|
"num_tokens": 24249799.0,
|
|
"step": 13145
|
|
},
|
|
{
|
|
"entropy": 5.680908250808716,
|
|
"epoch": 1.1047679059021214,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004883363696306007,
|
|
"loss": 5.454,
|
|
"mean_token_accuracy": 0.16249977350234984,
|
|
"num_tokens": 24259361.0,
|
|
"step": 13150
|
|
},
|
|
{
|
|
"entropy": 5.735293912887573,
|
|
"epoch": 1.1051879857172864,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004883268510735995,
|
|
"loss": 5.4144,
|
|
"mean_token_accuracy": 0.16268240362405778,
|
|
"num_tokens": 24268010.0,
|
|
"step": 13155
|
|
},
|
|
{
|
|
"entropy": 5.6632569313049315,
|
|
"epoch": 1.1056080655324512,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004883173287376229,
|
|
"loss": 5.4696,
|
|
"mean_token_accuracy": 0.15777850821614264,
|
|
"num_tokens": 24277416.0,
|
|
"step": 13160
|
|
},
|
|
{
|
|
"entropy": 5.762572288513184,
|
|
"epoch": 1.106028145347616,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004883078026228397,
|
|
"loss": 5.5442,
|
|
"mean_token_accuracy": 0.15905994772911072,
|
|
"num_tokens": 24286185.0,
|
|
"step": 13165
|
|
},
|
|
{
|
|
"entropy": 5.7264659881591795,
|
|
"epoch": 1.106448225162781,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004882982727294187,
|
|
"loss": 5.4127,
|
|
"mean_token_accuracy": 0.16001077443361283,
|
|
"num_tokens": 24295382.0,
|
|
"step": 13170
|
|
},
|
|
{
|
|
"entropy": 5.637160968780518,
|
|
"epoch": 1.1068683049779457,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004882887390575284,
|
|
"loss": 5.4253,
|
|
"mean_token_accuracy": 0.16418798714876176,
|
|
"num_tokens": 24305197.0,
|
|
"step": 13175
|
|
},
|
|
{
|
|
"entropy": 5.688330411911011,
|
|
"epoch": 1.1072883847931108,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004882792016073381,
|
|
"loss": 5.5197,
|
|
"mean_token_accuracy": 0.15734292566776276,
|
|
"num_tokens": 24314149.0,
|
|
"step": 13180
|
|
},
|
|
{
|
|
"entropy": 5.752083873748779,
|
|
"epoch": 1.1077084646082755,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00048826966037901655,
|
|
"loss": 5.452,
|
|
"mean_token_accuracy": 0.16154199242591857,
|
|
"num_tokens": 24323737.0,
|
|
"step": 13185
|
|
},
|
|
{
|
|
"entropy": 5.683094501495361,
|
|
"epoch": 1.1081285444234406,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048826011537273276,
|
|
"loss": 5.4217,
|
|
"mean_token_accuracy": 0.164495849609375,
|
|
"num_tokens": 24332853.0,
|
|
"step": 13190
|
|
},
|
|
{
|
|
"entropy": 5.667849111557007,
|
|
"epoch": 1.1085486242386053,
|
|
"grad_norm": 2.359375,
|
|
"learning_rate": 0.0004882505665886558,
|
|
"loss": 5.5583,
|
|
"mean_token_accuracy": 0.15878505557775496,
|
|
"num_tokens": 24342632.0,
|
|
"step": 13195
|
|
},
|
|
{
|
|
"entropy": 5.685575389862061,
|
|
"epoch": 1.1089687040537701,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048824101402695493,
|
|
"loss": 5.3884,
|
|
"mean_token_accuracy": 0.16723015904426575,
|
|
"num_tokens": 24351659.0,
|
|
"step": 13200
|
|
},
|
|
{
|
|
"entropy": 5.627767610549927,
|
|
"epoch": 1.1093887838689351,
|
|
"grad_norm": 2.234375,
|
|
"learning_rate": 0.0004882314576877993,
|
|
"loss": 5.4213,
|
|
"mean_token_accuracy": 0.1608801230788231,
|
|
"num_tokens": 24360938.0,
|
|
"step": 13205
|
|
},
|
|
{
|
|
"entropy": 5.6773035526275635,
|
|
"epoch": 1.1098088636841,
|
|
"grad_norm": 4.28125,
|
|
"learning_rate": 0.0004882218975713581,
|
|
"loss": 5.4965,
|
|
"mean_token_accuracy": 0.16553150117397308,
|
|
"num_tokens": 24369603.0,
|
|
"step": 13210
|
|
},
|
|
{
|
|
"entropy": 5.66359806060791,
|
|
"epoch": 1.110228943499265,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004882123336778009,
|
|
"loss": 5.4316,
|
|
"mean_token_accuracy": 0.16241261065006257,
|
|
"num_tokens": 24377605.0,
|
|
"step": 13215
|
|
},
|
|
{
|
|
"entropy": 5.703319883346557,
|
|
"epoch": 1.1106490233144297,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004882027660072969,
|
|
"loss": 5.4694,
|
|
"mean_token_accuracy": 0.15938784927129745,
|
|
"num_tokens": 24386930.0,
|
|
"step": 13220
|
|
},
|
|
{
|
|
"entropy": 5.699520874023437,
|
|
"epoch": 1.1110691031295947,
|
|
"grad_norm": 3.9375,
|
|
"learning_rate": 0.0004881931945600157,
|
|
"loss": 5.4329,
|
|
"mean_token_accuracy": 0.16960556656122208,
|
|
"num_tokens": 24396473.0,
|
|
"step": 13225
|
|
},
|
|
{
|
|
"entropy": 5.74881911277771,
|
|
"epoch": 1.1114891829447595,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004881836193361269,
|
|
"loss": 5.5287,
|
|
"mean_token_accuracy": 0.16469872146844863,
|
|
"num_tokens": 24405461.0,
|
|
"step": 13230
|
|
},
|
|
{
|
|
"entropy": 5.740612363815307,
|
|
"epoch": 1.1119092627599243,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004881740403358,
|
|
"loss": 5.459,
|
|
"mean_token_accuracy": 0.1663040205836296,
|
|
"num_tokens": 24414138.0,
|
|
"step": 13235
|
|
},
|
|
{
|
|
"entropy": 5.664918518066406,
|
|
"epoch": 1.1123293425750893,
|
|
"grad_norm": 1.90625,
|
|
"learning_rate": 0.00048816445755920474,
|
|
"loss": 5.4601,
|
|
"mean_token_accuracy": 0.15880708545446395,
|
|
"num_tokens": 24423386.0,
|
|
"step": 13240
|
|
},
|
|
{
|
|
"entropy": 5.675559234619141,
|
|
"epoch": 1.112749422390254,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.0004881548710065109,
|
|
"loss": 5.4695,
|
|
"mean_token_accuracy": 0.15784148275852203,
|
|
"num_tokens": 24433637.0,
|
|
"step": 13245
|
|
},
|
|
{
|
|
"entropy": 5.733191776275635,
|
|
"epoch": 1.113169502205419,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004881452806778883,
|
|
"loss": 5.5209,
|
|
"mean_token_accuracy": 0.16069816052913666,
|
|
"num_tokens": 24443677.0,
|
|
"step": 13250
|
|
},
|
|
{
|
|
"entropy": 5.709711790084839,
|
|
"epoch": 1.113589582020584,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.00048813568657350676,
|
|
"loss": 5.4059,
|
|
"mean_token_accuracy": 0.16754952371120452,
|
|
"num_tokens": 24452317.0,
|
|
"step": 13255
|
|
},
|
|
{
|
|
"entropy": 5.711081409454346,
|
|
"epoch": 1.1140096618357487,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004881260886935363,
|
|
"loss": 5.4314,
|
|
"mean_token_accuracy": 0.1628634065389633,
|
|
"num_tokens": 24460626.0,
|
|
"step": 13260
|
|
},
|
|
{
|
|
"entropy": 5.741526031494141,
|
|
"epoch": 1.1144297416509137,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.00048811648703814693,
|
|
"loss": 5.5194,
|
|
"mean_token_accuracy": 0.15109701529145242,
|
|
"num_tokens": 24469583.0,
|
|
"step": 13265
|
|
},
|
|
{
|
|
"entropy": 5.68180022239685,
|
|
"epoch": 1.1148498214660785,
|
|
"grad_norm": 1.71875,
|
|
"learning_rate": 0.0004881068816075087,
|
|
"loss": 5.4656,
|
|
"mean_token_accuracy": 0.16133692264556884,
|
|
"num_tokens": 24478811.0,
|
|
"step": 13270
|
|
},
|
|
{
|
|
"entropy": 5.655600738525391,
|
|
"epoch": 1.1152699012812435,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00048809727240179193,
|
|
"loss": 5.4917,
|
|
"mean_token_accuracy": 0.1590549409389496,
|
|
"num_tokens": 24487818.0,
|
|
"step": 13275
|
|
},
|
|
{
|
|
"entropy": 5.6665290832519535,
|
|
"epoch": 1.1156899810964083,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004880876594211665,
|
|
"loss": 5.4686,
|
|
"mean_token_accuracy": 0.15892062336206436,
|
|
"num_tokens": 24497087.0,
|
|
"step": 13280
|
|
},
|
|
{
|
|
"entropy": 5.743152189254761,
|
|
"epoch": 1.1161100609115733,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00048807804266580304,
|
|
"loss": 5.4271,
|
|
"mean_token_accuracy": 0.1575818032026291,
|
|
"num_tokens": 24505347.0,
|
|
"step": 13285
|
|
},
|
|
{
|
|
"entropy": 5.763370513916016,
|
|
"epoch": 1.116530140726738,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004880684221358717,
|
|
"loss": 5.4393,
|
|
"mean_token_accuracy": 0.15934379994869233,
|
|
"num_tokens": 24514732.0,
|
|
"step": 13290
|
|
},
|
|
{
|
|
"entropy": 5.707498073577881,
|
|
"epoch": 1.116950220541903,
|
|
"grad_norm": 1.3515625,
|
|
"learning_rate": 0.00048805879783154305,
|
|
"loss": 5.4771,
|
|
"mean_token_accuracy": 0.16071149557828904,
|
|
"num_tokens": 24523295.0,
|
|
"step": 13295
|
|
},
|
|
{
|
|
"entropy": 5.655390310287475,
|
|
"epoch": 1.1173703003570679,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.00048804916975298744,
|
|
"loss": 5.3769,
|
|
"mean_token_accuracy": 0.16449277400970458,
|
|
"num_tokens": 24532415.0,
|
|
"step": 13300
|
|
},
|
|
{
|
|
"entropy": 5.74219388961792,
|
|
"epoch": 1.1177903801722326,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004880395379003755,
|
|
"loss": 5.5072,
|
|
"mean_token_accuracy": 0.16419040709733962,
|
|
"num_tokens": 24541856.0,
|
|
"step": 13305
|
|
},
|
|
{
|
|
"entropy": 5.645684480667114,
|
|
"epoch": 1.1182104599873977,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00048802990227387797,
|
|
"loss": 5.5095,
|
|
"mean_token_accuracy": 0.1570297934114933,
|
|
"num_tokens": 24550982.0,
|
|
"step": 13310
|
|
},
|
|
{
|
|
"entropy": 5.759794473648071,
|
|
"epoch": 1.1186305398025624,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048802026287366525,
|
|
"loss": 5.583,
|
|
"mean_token_accuracy": 0.1525949463248253,
|
|
"num_tokens": 24561176.0,
|
|
"step": 13315
|
|
},
|
|
{
|
|
"entropy": 5.707194566726685,
|
|
"epoch": 1.1190506196177274,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.00048801061969990834,
|
|
"loss": 5.4472,
|
|
"mean_token_accuracy": 0.16643868386745453,
|
|
"num_tokens": 24570741.0,
|
|
"step": 13320
|
|
},
|
|
{
|
|
"entropy": 5.665846681594848,
|
|
"epoch": 1.1194706994328922,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048800097275277795,
|
|
"loss": 5.4593,
|
|
"mean_token_accuracy": 0.16555610448122024,
|
|
"num_tokens": 24580175.0,
|
|
"step": 13325
|
|
},
|
|
{
|
|
"entropy": 5.731688928604126,
|
|
"epoch": 1.119890779248057,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.000487991322032445,
|
|
"loss": 5.4515,
|
|
"mean_token_accuracy": 0.16520103812217712,
|
|
"num_tokens": 24588754.0,
|
|
"step": 13330
|
|
},
|
|
{
|
|
"entropy": 5.82857723236084,
|
|
"epoch": 1.120310859063222,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004879816675390805,
|
|
"loss": 5.6301,
|
|
"mean_token_accuracy": 0.15790451541543007,
|
|
"num_tokens": 24599429.0,
|
|
"step": 13335
|
|
},
|
|
{
|
|
"entropy": 5.654234075546265,
|
|
"epoch": 1.1207309388783868,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048797200927285547,
|
|
"loss": 5.3745,
|
|
"mean_token_accuracy": 0.1670805871486664,
|
|
"num_tokens": 24608767.0,
|
|
"step": 13340
|
|
},
|
|
{
|
|
"entropy": 5.682362413406372,
|
|
"epoch": 1.1211510186935518,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004879623472339409,
|
|
"loss": 5.5402,
|
|
"mean_token_accuracy": 0.16218066960573196,
|
|
"num_tokens": 24618232.0,
|
|
"step": 13345
|
|
},
|
|
{
|
|
"entropy": 5.671395492553711,
|
|
"epoch": 1.1215710985087166,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.000487952681422508,
|
|
"loss": 5.4195,
|
|
"mean_token_accuracy": 0.16594525426626205,
|
|
"num_tokens": 24626986.0,
|
|
"step": 13350
|
|
},
|
|
{
|
|
"entropy": 5.57235198020935,
|
|
"epoch": 1.1219911783238816,
|
|
"grad_norm": 1.40625,
|
|
"learning_rate": 0.000487943011838728,
|
|
"loss": 5.2896,
|
|
"mean_token_accuracy": 0.17551097571849822,
|
|
"num_tokens": 24635283.0,
|
|
"step": 13355
|
|
},
|
|
{
|
|
"entropy": 5.603815412521362,
|
|
"epoch": 1.1224112581390464,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004879333384827722,
|
|
"loss": 5.4099,
|
|
"mean_token_accuracy": 0.16004293411970139,
|
|
"num_tokens": 24644451.0,
|
|
"step": 13360
|
|
},
|
|
{
|
|
"entropy": 5.793188524246216,
|
|
"epoch": 1.1228313379542114,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004879236613548119,
|
|
"loss": 5.5573,
|
|
"mean_token_accuracy": 0.15772061198949813,
|
|
"num_tokens": 24654811.0,
|
|
"step": 13365
|
|
},
|
|
{
|
|
"entropy": 5.736780405044556,
|
|
"epoch": 1.1232514177693762,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004879139804550187,
|
|
"loss": 5.4654,
|
|
"mean_token_accuracy": 0.16606503427028657,
|
|
"num_tokens": 24663712.0,
|
|
"step": 13370
|
|
},
|
|
{
|
|
"entropy": 5.724746227264404,
|
|
"epoch": 1.123671497584541,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00048790429578356387,
|
|
"loss": 5.5665,
|
|
"mean_token_accuracy": 0.15672928094863892,
|
|
"num_tokens": 24672518.0,
|
|
"step": 13375
|
|
},
|
|
{
|
|
"entropy": 5.733008527755738,
|
|
"epoch": 1.124091577399706,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048789460734061915,
|
|
"loss": 5.4963,
|
|
"mean_token_accuracy": 0.15912166833877564,
|
|
"num_tokens": 24681900.0,
|
|
"step": 13380
|
|
},
|
|
{
|
|
"entropy": 5.731820964813233,
|
|
"epoch": 1.1245116572148708,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004878849151263561,
|
|
"loss": 5.4631,
|
|
"mean_token_accuracy": 0.16634195446968078,
|
|
"num_tokens": 24691760.0,
|
|
"step": 13385
|
|
},
|
|
{
|
|
"entropy": 5.733888721466064,
|
|
"epoch": 1.1249317370300358,
|
|
"grad_norm": 1.359375,
|
|
"learning_rate": 0.0004878752191409463,
|
|
"loss": 5.4154,
|
|
"mean_token_accuracy": 0.16850126534700394,
|
|
"num_tokens": 24700742.0,
|
|
"step": 13390
|
|
},
|
|
{
|
|
"entropy": 5.593951797485351,
|
|
"epoch": 1.1253518168452006,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004878655193845616,
|
|
"loss": 5.4872,
|
|
"mean_token_accuracy": 0.1581694796681404,
|
|
"num_tokens": 24709329.0,
|
|
"step": 13395
|
|
},
|
|
{
|
|
"entropy": 5.692504644393921,
|
|
"epoch": 1.1257718966603654,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.00048785581585737394,
|
|
"loss": 5.6223,
|
|
"mean_token_accuracy": 0.15529729127883912,
|
|
"num_tokens": 24718475.0,
|
|
"step": 13400
|
|
},
|
|
{
|
|
"entropy": 5.802961778640747,
|
|
"epoch": 1.1261919764755304,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000487846108559555,
|
|
"loss": 5.4954,
|
|
"mean_token_accuracy": 0.17018510401248932,
|
|
"num_tokens": 24727817.0,
|
|
"step": 13405
|
|
},
|
|
{
|
|
"entropy": 5.656300020217896,
|
|
"epoch": 1.1266120562906952,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048783639749127694,
|
|
"loss": 5.4561,
|
|
"mean_token_accuracy": 0.1604897528886795,
|
|
"num_tokens": 24737057.0,
|
|
"step": 13410
|
|
},
|
|
{
|
|
"entropy": 5.670790195465088,
|
|
"epoch": 1.1270321361058602,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.0004878266826527116,
|
|
"loss": 5.5137,
|
|
"mean_token_accuracy": 0.15796383768320083,
|
|
"num_tokens": 24746016.0,
|
|
"step": 13415
|
|
},
|
|
{
|
|
"entropy": 5.821008825302124,
|
|
"epoch": 1.127452215921025,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048781696404403126,
|
|
"loss": 5.5078,
|
|
"mean_token_accuracy": 0.16149032413959502,
|
|
"num_tokens": 24755978.0,
|
|
"step": 13420
|
|
},
|
|
{
|
|
"entropy": 5.697250509262085,
|
|
"epoch": 1.12787229573619,
|
|
"grad_norm": 4.46875,
|
|
"learning_rate": 0.00048780724166540794,
|
|
"loss": 5.3952,
|
|
"mean_token_accuracy": 0.16163186430931092,
|
|
"num_tokens": 24765255.0,
|
|
"step": 13425
|
|
},
|
|
{
|
|
"entropy": 5.634158039093018,
|
|
"epoch": 1.1282923755513548,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004877975155170139,
|
|
"loss": 5.4805,
|
|
"mean_token_accuracy": 0.15824283584952353,
|
|
"num_tokens": 24774339.0,
|
|
"step": 13430
|
|
},
|
|
{
|
|
"entropy": 5.692260265350342,
|
|
"epoch": 1.1287124553665198,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004877877855990215,
|
|
"loss": 5.4813,
|
|
"mean_token_accuracy": 0.15508203208446503,
|
|
"num_tokens": 24783236.0,
|
|
"step": 13435
|
|
},
|
|
{
|
|
"entropy": 5.687100839614868,
|
|
"epoch": 1.1291325351816845,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.000487778051911603,
|
|
"loss": 5.3898,
|
|
"mean_token_accuracy": 0.1657722622156143,
|
|
"num_tokens": 24792168.0,
|
|
"step": 13440
|
|
},
|
|
{
|
|
"entropy": 5.77692666053772,
|
|
"epoch": 1.1295526149968493,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.0004877683144549308,
|
|
"loss": 5.5277,
|
|
"mean_token_accuracy": 0.1641865387558937,
|
|
"num_tokens": 24800843.0,
|
|
"step": 13445
|
|
},
|
|
{
|
|
"entropy": 5.650931930541992,
|
|
"epoch": 1.1299726948120143,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00048775857322917753,
|
|
"loss": 5.426,
|
|
"mean_token_accuracy": 0.16078004539012908,
|
|
"num_tokens": 24810475.0,
|
|
"step": 13450
|
|
},
|
|
{
|
|
"entropy": 5.622085809707642,
|
|
"epoch": 1.1303927746271791,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004877488282345158,
|
|
"loss": 5.4865,
|
|
"mean_token_accuracy": 0.16201561838388442,
|
|
"num_tokens": 24820486.0,
|
|
"step": 13455
|
|
},
|
|
{
|
|
"entropy": 5.775274133682251,
|
|
"epoch": 1.1308128544423441,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.000487739079471118,
|
|
"loss": 5.551,
|
|
"mean_token_accuracy": 0.16165002584457397,
|
|
"num_tokens": 24830243.0,
|
|
"step": 13460
|
|
},
|
|
{
|
|
"entropy": 5.764797496795654,
|
|
"epoch": 1.131232934257509,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.000487729326939157,
|
|
"loss": 5.4526,
|
|
"mean_token_accuracy": 0.1641850858926773,
|
|
"num_tokens": 24839090.0,
|
|
"step": 13465
|
|
},
|
|
{
|
|
"entropy": 5.6410528182983395,
|
|
"epoch": 1.1316530140726737,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.00048771957063880553,
|
|
"loss": 5.4519,
|
|
"mean_token_accuracy": 0.16257893294095993,
|
|
"num_tokens": 24847933.0,
|
|
"step": 13470
|
|
},
|
|
{
|
|
"entropy": 5.754781723022461,
|
|
"epoch": 1.1320730938878387,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004877098105702363,
|
|
"loss": 5.4659,
|
|
"mean_token_accuracy": 0.1691000446677208,
|
|
"num_tokens": 24857037.0,
|
|
"step": 13475
|
|
},
|
|
{
|
|
"entropy": 5.613031244277954,
|
|
"epoch": 1.1324931737030035,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048770004673362243,
|
|
"loss": 5.2929,
|
|
"mean_token_accuracy": 0.17315517216920853,
|
|
"num_tokens": 24866042.0,
|
|
"step": 13480
|
|
},
|
|
{
|
|
"entropy": 5.554357528686523,
|
|
"epoch": 1.1329132535181685,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00048769027912913673,
|
|
"loss": 5.2553,
|
|
"mean_token_accuracy": 0.17282647341489793,
|
|
"num_tokens": 24873735.0,
|
|
"step": 13485
|
|
},
|
|
{
|
|
"entropy": 5.519758033752441,
|
|
"epoch": 1.1333333333333333,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004876805077569522,
|
|
"loss": 5.3555,
|
|
"mean_token_accuracy": 0.16498854458332063,
|
|
"num_tokens": 24882277.0,
|
|
"step": 13490
|
|
},
|
|
{
|
|
"entropy": 5.610137462615967,
|
|
"epoch": 1.133753413148498,
|
|
"grad_norm": 1.390625,
|
|
"learning_rate": 0.00048767073261724204,
|
|
"loss": 5.4597,
|
|
"mean_token_accuracy": 0.1604814663529396,
|
|
"num_tokens": 24891354.0,
|
|
"step": 13495
|
|
},
|
|
{
|
|
"entropy": 5.69881238937378,
|
|
"epoch": 1.134173492963663,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004876609537101793,
|
|
"loss": 5.4431,
|
|
"mean_token_accuracy": 0.16179838478565217,
|
|
"num_tokens": 24899887.0,
|
|
"step": 13500
|
|
},
|
|
{
|
|
"entropy": 5.7566437244415285,
|
|
"epoch": 1.1345935727788279,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004876511710359374,
|
|
"loss": 5.4648,
|
|
"mean_token_accuracy": 0.16314765214920043,
|
|
"num_tokens": 24908616.0,
|
|
"step": 13505
|
|
},
|
|
{
|
|
"entropy": 5.7111047267913815,
|
|
"epoch": 1.135013652593993,
|
|
"grad_norm": 1.953125,
|
|
"learning_rate": 0.00048764138459468935,
|
|
"loss": 5.5047,
|
|
"mean_token_accuracy": 0.1574953481554985,
|
|
"num_tokens": 24917864.0,
|
|
"step": 13510
|
|
},
|
|
{
|
|
"entropy": 5.731399297714233,
|
|
"epoch": 1.1354337324091577,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048763159438660876,
|
|
"loss": 5.527,
|
|
"mean_token_accuracy": 0.1585410252213478,
|
|
"num_tokens": 24927864.0,
|
|
"step": 13515
|
|
},
|
|
{
|
|
"entropy": 5.650017118453979,
|
|
"epoch": 1.1358538122243227,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.00048762180041186893,
|
|
"loss": 5.4189,
|
|
"mean_token_accuracy": 0.1666693225502968,
|
|
"num_tokens": 24937146.0,
|
|
"step": 13520
|
|
},
|
|
{
|
|
"entropy": 5.721314907073975,
|
|
"epoch": 1.1362738920394875,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004876120026706434,
|
|
"loss": 5.5001,
|
|
"mean_token_accuracy": 0.16370597183704377,
|
|
"num_tokens": 24945694.0,
|
|
"step": 13525
|
|
},
|
|
{
|
|
"entropy": 5.627752208709717,
|
|
"epoch": 1.1366939718546525,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004876022011631057,
|
|
"loss": 5.3983,
|
|
"mean_token_accuracy": 0.16853295415639877,
|
|
"num_tokens": 24955325.0,
|
|
"step": 13530
|
|
},
|
|
{
|
|
"entropy": 5.646707582473755,
|
|
"epoch": 1.1371140516698173,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004875923958894295,
|
|
"loss": 5.2837,
|
|
"mean_token_accuracy": 0.17064532786607742,
|
|
"num_tokens": 24964028.0,
|
|
"step": 13535
|
|
},
|
|
{
|
|
"entropy": 5.666370153427124,
|
|
"epoch": 1.137534131484982,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048758258684978846,
|
|
"loss": 5.4713,
|
|
"mean_token_accuracy": 0.16217924058437347,
|
|
"num_tokens": 24972923.0,
|
|
"step": 13540
|
|
},
|
|
{
|
|
"entropy": 5.651957702636719,
|
|
"epoch": 1.137954211300147,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048757277404435636,
|
|
"loss": 5.3706,
|
|
"mean_token_accuracy": 0.1657715320587158,
|
|
"num_tokens": 24982156.0,
|
|
"step": 13545
|
|
},
|
|
{
|
|
"entropy": 5.664628601074218,
|
|
"epoch": 1.1383742911153119,
|
|
"grad_norm": 1.5703125,
|
|
"learning_rate": 0.000487562957473307,
|
|
"loss": 5.418,
|
|
"mean_token_accuracy": 0.16670742779970169,
|
|
"num_tokens": 24991616.0,
|
|
"step": 13550
|
|
},
|
|
{
|
|
"entropy": 5.60991358757019,
|
|
"epoch": 1.1387943709304769,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004875531371368144,
|
|
"loss": 5.4697,
|
|
"mean_token_accuracy": 0.16192520707845687,
|
|
"num_tokens": 25001140.0,
|
|
"step": 13555
|
|
},
|
|
{
|
|
"entropy": 5.696591901779175,
|
|
"epoch": 1.1392144507456416,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048754331303505236,
|
|
"loss": 5.3951,
|
|
"mean_token_accuracy": 0.16725025027990342,
|
|
"num_tokens": 25010863.0,
|
|
"step": 13560
|
|
},
|
|
{
|
|
"entropy": 5.726091575622559,
|
|
"epoch": 1.1396345305608064,
|
|
"grad_norm": 1.796875,
|
|
"learning_rate": 0.00048753348516819496,
|
|
"loss": 5.5024,
|
|
"mean_token_accuracy": 0.15817077159881593,
|
|
"num_tokens": 25019770.0,
|
|
"step": 13565
|
|
},
|
|
{
|
|
"entropy": 5.7528482437133786,
|
|
"epoch": 1.1400546103759714,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004875236535364163,
|
|
"loss": 5.5333,
|
|
"mean_token_accuracy": 0.15655099377036094,
|
|
"num_tokens": 25029900.0,
|
|
"step": 13570
|
|
},
|
|
{
|
|
"entropy": 5.781103563308716,
|
|
"epoch": 1.1404746901911362,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004875138181398906,
|
|
"loss": 5.4958,
|
|
"mean_token_accuracy": 0.1637464240193367,
|
|
"num_tokens": 25039428.0,
|
|
"step": 13575
|
|
},
|
|
{
|
|
"entropy": 5.688928604125977,
|
|
"epoch": 1.1408947700063012,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.000487503978978792,
|
|
"loss": 5.4784,
|
|
"mean_token_accuracy": 0.15997918397188188,
|
|
"num_tokens": 25049145.0,
|
|
"step": 13580
|
|
},
|
|
{
|
|
"entropy": 5.699268198013305,
|
|
"epoch": 1.141314849821466,
|
|
"grad_norm": 1.34375,
|
|
"learning_rate": 0.00048749413605329487,
|
|
"loss": 5.5024,
|
|
"mean_token_accuracy": 0.1619054026901722,
|
|
"num_tokens": 25058772.0,
|
|
"step": 13585
|
|
},
|
|
{
|
|
"entropy": 5.69202241897583,
|
|
"epoch": 1.141734929636631,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048748428936357346,
|
|
"loss": 5.4063,
|
|
"mean_token_accuracy": 0.16621732711791992,
|
|
"num_tokens": 25067249.0,
|
|
"step": 13590
|
|
},
|
|
{
|
|
"entropy": 5.689562606811523,
|
|
"epoch": 1.1421550094517958,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004874744389098024,
|
|
"loss": 5.3995,
|
|
"mean_token_accuracy": 0.1586827367544174,
|
|
"num_tokens": 25076893.0,
|
|
"step": 13595
|
|
},
|
|
{
|
|
"entropy": 5.665656042098999,
|
|
"epoch": 1.1425750892669608,
|
|
"grad_norm": 1.2890625,
|
|
"learning_rate": 0.0004874645846921559,
|
|
"loss": 5.3994,
|
|
"mean_token_accuracy": 0.1694393739104271,
|
|
"num_tokens": 25086238.0,
|
|
"step": 13600
|
|
},
|
|
{
|
|
"entropy": 5.644594764709472,
|
|
"epoch": 1.1429951690821256,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048745472671080884,
|
|
"loss": 5.4037,
|
|
"mean_token_accuracy": 0.16155755817890166,
|
|
"num_tokens": 25095334.0,
|
|
"step": 13605
|
|
},
|
|
{
|
|
"entropy": 5.604833793640137,
|
|
"epoch": 1.1434152488972904,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048744486496593565,
|
|
"loss": 5.404,
|
|
"mean_token_accuracy": 0.1675198942422867,
|
|
"num_tokens": 25104136.0,
|
|
"step": 13610
|
|
},
|
|
{
|
|
"entropy": 5.678550148010254,
|
|
"epoch": 1.1438353287124554,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.000487434999457711,
|
|
"loss": 5.3981,
|
|
"mean_token_accuracy": 0.17361067086458207,
|
|
"num_tokens": 25112629.0,
|
|
"step": 13615
|
|
},
|
|
{
|
|
"entropy": 5.709067010879517,
|
|
"epoch": 1.1442554085276202,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.0004874251301863098,
|
|
"loss": 5.4307,
|
|
"mean_token_accuracy": 0.15858973413705826,
|
|
"num_tokens": 25121014.0,
|
|
"step": 13620
|
|
},
|
|
{
|
|
"entropy": 5.631903171539307,
|
|
"epoch": 1.1446754883427852,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048741525715190675,
|
|
"loss": 5.4706,
|
|
"mean_token_accuracy": 0.16206472367048264,
|
|
"num_tokens": 25130097.0,
|
|
"step": 13625
|
|
},
|
|
{
|
|
"entropy": 5.711316680908203,
|
|
"epoch": 1.14509556815795,
|
|
"grad_norm": 1.25,
|
|
"learning_rate": 0.0004874053803546769,
|
|
"loss": 5.4696,
|
|
"mean_token_accuracy": 0.16349728405475616,
|
|
"num_tokens": 25139065.0,
|
|
"step": 13630
|
|
},
|
|
{
|
|
"entropy": 5.652597141265869,
|
|
"epoch": 1.1455156479731148,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.000487395499794795,
|
|
"loss": 5.4533,
|
|
"mean_token_accuracy": 0.16656357645988465,
|
|
"num_tokens": 25148852.0,
|
|
"step": 13635
|
|
},
|
|
{
|
|
"entropy": 5.617219066619873,
|
|
"epoch": 1.1459357277882798,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004873856154724362,
|
|
"loss": 5.3506,
|
|
"mean_token_accuracy": 0.1754961669445038,
|
|
"num_tokens": 25157580.0,
|
|
"step": 13640
|
|
},
|
|
{
|
|
"entropy": 5.680072975158692,
|
|
"epoch": 1.1463558076034446,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004873757273877756,
|
|
"loss": 5.4561,
|
|
"mean_token_accuracy": 0.163452672958374,
|
|
"num_tokens": 25166243.0,
|
|
"step": 13645
|
|
},
|
|
{
|
|
"entropy": 5.710014724731446,
|
|
"epoch": 1.1467758874186096,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048736583554098836,
|
|
"loss": 5.467,
|
|
"mean_token_accuracy": 0.16175027936697006,
|
|
"num_tokens": 25174674.0,
|
|
"step": 13650
|
|
},
|
|
{
|
|
"entropy": 5.641525459289551,
|
|
"epoch": 1.1471959672337744,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048735593993224973,
|
|
"loss": 5.3815,
|
|
"mean_token_accuracy": 0.1715010792016983,
|
|
"num_tokens": 25183892.0,
|
|
"step": 13655
|
|
},
|
|
{
|
|
"entropy": 5.654358148574829,
|
|
"epoch": 1.1476160470489394,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.00048734604056173495,
|
|
"loss": 5.4056,
|
|
"mean_token_accuracy": 0.16546755135059357,
|
|
"num_tokens": 25192731.0,
|
|
"step": 13660
|
|
},
|
|
{
|
|
"entropy": 5.659243631362915,
|
|
"epoch": 1.1480361268641042,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00048733613742961933,
|
|
"loss": 5.5014,
|
|
"mean_token_accuracy": 0.1672609716653824,
|
|
"num_tokens": 25201280.0,
|
|
"step": 13665
|
|
},
|
|
{
|
|
"entropy": 5.653694009780883,
|
|
"epoch": 1.1484562066792692,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.00048732623053607846,
|
|
"loss": 5.3927,
|
|
"mean_token_accuracy": 0.16543423086404802,
|
|
"num_tokens": 25209929.0,
|
|
"step": 13670
|
|
},
|
|
{
|
|
"entropy": 5.695533609390258,
|
|
"epoch": 1.148876286494434,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.0004873163198812877,
|
|
"loss": 5.3275,
|
|
"mean_token_accuracy": 0.17144448906183243,
|
|
"num_tokens": 25218583.0,
|
|
"step": 13675
|
|
},
|
|
{
|
|
"entropy": 5.738392496109009,
|
|
"epoch": 1.1492963663095987,
|
|
"grad_norm": 2.21875,
|
|
"learning_rate": 0.0004873064054654227,
|
|
"loss": 5.5484,
|
|
"mean_token_accuracy": 0.1579354114830494,
|
|
"num_tokens": 25228949.0,
|
|
"step": 13680
|
|
},
|
|
{
|
|
"entropy": 5.683894395828247,
|
|
"epoch": 1.1497164461247638,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.00048729648728865904,
|
|
"loss": 5.3731,
|
|
"mean_token_accuracy": 0.1767183229327202,
|
|
"num_tokens": 25238603.0,
|
|
"step": 13685
|
|
},
|
|
{
|
|
"entropy": 5.71501407623291,
|
|
"epoch": 1.1501365259399285,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.00048728656535117237,
|
|
"loss": 5.4992,
|
|
"mean_token_accuracy": 0.15357865989208222,
|
|
"num_tokens": 25248265.0,
|
|
"step": 13690
|
|
},
|
|
{
|
|
"entropy": 5.643803834915161,
|
|
"epoch": 1.1505566057550936,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004872766396531386,
|
|
"loss": 5.484,
|
|
"mean_token_accuracy": 0.16071568578481674,
|
|
"num_tokens": 25258195.0,
|
|
"step": 13695
|
|
},
|
|
{
|
|
"entropy": 5.702970218658447,
|
|
"epoch": 1.1509766855702583,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048726671019473335,
|
|
"loss": 5.4422,
|
|
"mean_token_accuracy": 0.16466852724552156,
|
|
"num_tokens": 25267886.0,
|
|
"step": 13700
|
|
},
|
|
{
|
|
"entropy": 5.746435308456421,
|
|
"epoch": 1.1513967653854231,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048725677697613267,
|
|
"loss": 5.4682,
|
|
"mean_token_accuracy": 0.16264414340257644,
|
|
"num_tokens": 25277304.0,
|
|
"step": 13705
|
|
},
|
|
{
|
|
"entropy": 5.713738203048706,
|
|
"epoch": 1.1518168452005881,
|
|
"grad_norm": 2.078125,
|
|
"learning_rate": 0.0004872468399975125,
|
|
"loss": 5.4881,
|
|
"mean_token_accuracy": 0.15376320034265517,
|
|
"num_tokens": 25286771.0,
|
|
"step": 13710
|
|
},
|
|
{
|
|
"entropy": 5.780117321014404,
|
|
"epoch": 1.152236925015753,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 0.00048723689925904884,
|
|
"loss": 5.5313,
|
|
"mean_token_accuracy": 0.1592633917927742,
|
|
"num_tokens": 25296018.0,
|
|
"step": 13715
|
|
},
|
|
{
|
|
"entropy": 5.741491746902466,
|
|
"epoch": 1.152657004830918,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004872269547609179,
|
|
"loss": 5.4953,
|
|
"mean_token_accuracy": 0.16684821844100953,
|
|
"num_tokens": 25305737.0,
|
|
"step": 13720
|
|
},
|
|
{
|
|
"entropy": 5.593285465240479,
|
|
"epoch": 1.1530770846460827,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004872170065032956,
|
|
"loss": 5.3045,
|
|
"mean_token_accuracy": 0.16534190028905868,
|
|
"num_tokens": 25314625.0,
|
|
"step": 13725
|
|
},
|
|
{
|
|
"entropy": 5.624049663543701,
|
|
"epoch": 1.1534971644612477,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004872070544863584,
|
|
"loss": 5.4599,
|
|
"mean_token_accuracy": 0.16450536549091338,
|
|
"num_tokens": 25323453.0,
|
|
"step": 13730
|
|
},
|
|
{
|
|
"entropy": 5.670636749267578,
|
|
"epoch": 1.1539172442764125,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004871970987102824,
|
|
"loss": 5.4532,
|
|
"mean_token_accuracy": 0.16838308423757553,
|
|
"num_tokens": 25333236.0,
|
|
"step": 13735
|
|
},
|
|
{
|
|
"entropy": 5.692652988433838,
|
|
"epoch": 1.1543373240915775,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004871871391752442,
|
|
"loss": 5.3671,
|
|
"mean_token_accuracy": 0.16096194535493852,
|
|
"num_tokens": 25341993.0,
|
|
"step": 13740
|
|
},
|
|
{
|
|
"entropy": 5.680402231216431,
|
|
"epoch": 1.1547574039067423,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048717717588141993,
|
|
"loss": 5.4006,
|
|
"mean_token_accuracy": 0.1673072651028633,
|
|
"num_tokens": 25350695.0,
|
|
"step": 13745
|
|
},
|
|
{
|
|
"entropy": 5.732087278366089,
|
|
"epoch": 1.155177483721907,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004871672088289863,
|
|
"loss": 5.4491,
|
|
"mean_token_accuracy": 0.16335247606039047,
|
|
"num_tokens": 25359044.0,
|
|
"step": 13750
|
|
},
|
|
{
|
|
"entropy": 5.652139139175415,
|
|
"epoch": 1.155597563537072,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048715723801811986,
|
|
"loss": 5.4682,
|
|
"mean_token_accuracy": 0.1620037004351616,
|
|
"num_tokens": 25367959.0,
|
|
"step": 13755
|
|
},
|
|
{
|
|
"entropy": 5.666346216201783,
|
|
"epoch": 1.156017643352237,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00048714726344899716,
|
|
"loss": 5.471,
|
|
"mean_token_accuracy": 0.16407882869243623,
|
|
"num_tokens": 25376968.0,
|
|
"step": 13760
|
|
},
|
|
{
|
|
"entropy": 5.6227333545684814,
|
|
"epoch": 1.156437723167402,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004871372851217949,
|
|
"loss": 5.3543,
|
|
"mean_token_accuracy": 0.1680816724896431,
|
|
"num_tokens": 25385381.0,
|
|
"step": 13765
|
|
},
|
|
{
|
|
"entropy": 5.7028580665588375,
|
|
"epoch": 1.1568578029825667,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004871273030366899,
|
|
"loss": 5.4631,
|
|
"mean_token_accuracy": 0.1649225354194641,
|
|
"num_tokens": 25394647.0,
|
|
"step": 13770
|
|
},
|
|
{
|
|
"entropy": 5.648839139938355,
|
|
"epoch": 1.1572778827977315,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.0004871173171938589,
|
|
"loss": 5.4319,
|
|
"mean_token_accuracy": 0.17046377956867217,
|
|
"num_tokens": 25403973.0,
|
|
"step": 13775
|
|
},
|
|
{
|
|
"entropy": 5.6703736782073975,
|
|
"epoch": 1.1576979626128965,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004871073275934789,
|
|
"loss": 5.3984,
|
|
"mean_token_accuracy": 0.1691466361284256,
|
|
"num_tokens": 25412319.0,
|
|
"step": 13780
|
|
},
|
|
{
|
|
"entropy": 5.643203210830689,
|
|
"epoch": 1.1581180424280613,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00048709733423572685,
|
|
"loss": 5.4302,
|
|
"mean_token_accuracy": 0.16623932421207427,
|
|
"num_tokens": 25420558.0,
|
|
"step": 13785
|
|
},
|
|
{
|
|
"entropy": 5.591461992263794,
|
|
"epoch": 1.1585381222432263,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048708733712077973,
|
|
"loss": 5.3857,
|
|
"mean_token_accuracy": 0.16710084825754165,
|
|
"num_tokens": 25429258.0,
|
|
"step": 13790
|
|
},
|
|
{
|
|
"entropy": 5.708824634552002,
|
|
"epoch": 1.158958202058391,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004870773362488146,
|
|
"loss": 5.3492,
|
|
"mean_token_accuracy": 0.17254966199398042,
|
|
"num_tokens": 25438005.0,
|
|
"step": 13795
|
|
},
|
|
{
|
|
"entropy": 5.66907844543457,
|
|
"epoch": 1.159378281873556,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004870673316200087,
|
|
"loss": 5.3715,
|
|
"mean_token_accuracy": 0.1682361513376236,
|
|
"num_tokens": 25447120.0,
|
|
"step": 13800
|
|
},
|
|
{
|
|
"entropy": 5.595958423614502,
|
|
"epoch": 1.1597983616887209,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.0004870573232345392,
|
|
"loss": 5.3666,
|
|
"mean_token_accuracy": 0.16778069287538527,
|
|
"num_tokens": 25456216.0,
|
|
"step": 13805
|
|
},
|
|
{
|
|
"entropy": 5.823038816452026,
|
|
"epoch": 1.1602184415038856,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004870473110925834,
|
|
"loss": 5.6262,
|
|
"mean_token_accuracy": 0.1571579709649086,
|
|
"num_tokens": 25466456.0,
|
|
"step": 13810
|
|
},
|
|
{
|
|
"entropy": 5.6383460521697994,
|
|
"epoch": 1.1606385213190507,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004870372951943187,
|
|
"loss": 5.293,
|
|
"mean_token_accuracy": 0.17360034435987473,
|
|
"num_tokens": 25475217.0,
|
|
"step": 13815
|
|
},
|
|
{
|
|
"entropy": 5.673424625396729,
|
|
"epoch": 1.1610586011342154,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048702727553992243,
|
|
"loss": 5.5869,
|
|
"mean_token_accuracy": 0.15147149562835693,
|
|
"num_tokens": 25484617.0,
|
|
"step": 13820
|
|
},
|
|
{
|
|
"entropy": 5.6951432704925535,
|
|
"epoch": 1.1614786809493804,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048701725212957223,
|
|
"loss": 5.3764,
|
|
"mean_token_accuracy": 0.17048204690217972,
|
|
"num_tokens": 25493936.0,
|
|
"step": 13825
|
|
},
|
|
{
|
|
"entropy": 5.621255588531494,
|
|
"epoch": 1.1618987607645452,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.0004870072249634455,
|
|
"loss": 5.3668,
|
|
"mean_token_accuracy": 0.16855009347200395,
|
|
"num_tokens": 25502306.0,
|
|
"step": 13830
|
|
},
|
|
{
|
|
"entropy": 5.569633483886719,
|
|
"epoch": 1.1623188405797102,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.00048699719404172006,
|
|
"loss": 5.4267,
|
|
"mean_token_accuracy": 0.16455204039812088,
|
|
"num_tokens": 25511247.0,
|
|
"step": 13835
|
|
},
|
|
{
|
|
"entropy": 5.657006931304932,
|
|
"epoch": 1.162738920394875,
|
|
"grad_norm": 1.3359375,
|
|
"learning_rate": 0.00048698715936457344,
|
|
"loss": 5.4694,
|
|
"mean_token_accuracy": 0.16329195201396943,
|
|
"num_tokens": 25520482.0,
|
|
"step": 13840
|
|
},
|
|
{
|
|
"entropy": 5.640268325805664,
|
|
"epoch": 1.1631590002100398,
|
|
"grad_norm": 1.7734375,
|
|
"learning_rate": 0.00048697712093218336,
|
|
"loss": 5.3744,
|
|
"mean_token_accuracy": 0.16461358666419984,
|
|
"num_tokens": 25529854.0,
|
|
"step": 13845
|
|
},
|
|
{
|
|
"entropy": 5.623713779449463,
|
|
"epoch": 1.1635790800252048,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004869670787447279,
|
|
"loss": 5.3002,
|
|
"mean_token_accuracy": 0.16914840638637543,
|
|
"num_tokens": 25538251.0,
|
|
"step": 13850
|
|
},
|
|
{
|
|
"entropy": 5.638497924804687,
|
|
"epoch": 1.1639991598403696,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004869570328023846,
|
|
"loss": 5.3871,
|
|
"mean_token_accuracy": 0.16646116971969604,
|
|
"num_tokens": 25546889.0,
|
|
"step": 13855
|
|
},
|
|
{
|
|
"entropy": 5.594804191589356,
|
|
"epoch": 1.1644192396555346,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.00048694698310533177,
|
|
"loss": 5.4603,
|
|
"mean_token_accuracy": 0.16453583240509034,
|
|
"num_tokens": 25557040.0,
|
|
"step": 13860
|
|
},
|
|
{
|
|
"entropy": 5.753741073608398,
|
|
"epoch": 1.1648393194706994,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004869369296537472,
|
|
"loss": 5.609,
|
|
"mean_token_accuracy": 0.15437615364789964,
|
|
"num_tokens": 25565798.0,
|
|
"step": 13865
|
|
},
|
|
{
|
|
"entropy": 5.787939405441284,
|
|
"epoch": 1.1652593992858642,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.0004869268724478091,
|
|
"loss": 5.4344,
|
|
"mean_token_accuracy": 0.16476142406463623,
|
|
"num_tokens": 25575039.0,
|
|
"step": 13870
|
|
},
|
|
{
|
|
"entropy": 5.709807395935059,
|
|
"epoch": 1.1656794791010292,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048691681148769545,
|
|
"loss": 5.447,
|
|
"mean_token_accuracy": 0.16519313603639602,
|
|
"num_tokens": 25584635.0,
|
|
"step": 13875
|
|
},
|
|
{
|
|
"entropy": 5.570197010040284,
|
|
"epoch": 1.166099558916194,
|
|
"grad_norm": 2.0,
|
|
"learning_rate": 0.0004869067467735847,
|
|
"loss": 5.3732,
|
|
"mean_token_accuracy": 0.16737187206745147,
|
|
"num_tokens": 25593736.0,
|
|
"step": 13880
|
|
},
|
|
{
|
|
"entropy": 5.625331687927246,
|
|
"epoch": 1.166519638731359,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004868966783056551,
|
|
"loss": 5.3503,
|
|
"mean_token_accuracy": 0.17617221772670746,
|
|
"num_tokens": 25602685.0,
|
|
"step": 13885
|
|
},
|
|
{
|
|
"entropy": 5.6361452579498295,
|
|
"epoch": 1.1669397185465238,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.00048688660608408484,
|
|
"loss": 5.4137,
|
|
"mean_token_accuracy": 0.16371521204710007,
|
|
"num_tokens": 25610690.0,
|
|
"step": 13890
|
|
},
|
|
{
|
|
"entropy": 5.572535657882691,
|
|
"epoch": 1.1673597983616888,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048687653010905254,
|
|
"loss": 5.3114,
|
|
"mean_token_accuracy": 0.17053646743297576,
|
|
"num_tokens": 25619805.0,
|
|
"step": 13895
|
|
},
|
|
{
|
|
"entropy": 5.698605394363403,
|
|
"epoch": 1.1677798781768536,
|
|
"grad_norm": 1.8046875,
|
|
"learning_rate": 0.00048686645038073664,
|
|
"loss": 5.5349,
|
|
"mean_token_accuracy": 0.15510550141334534,
|
|
"num_tokens": 25629447.0,
|
|
"step": 13900
|
|
},
|
|
{
|
|
"entropy": 5.669313621520996,
|
|
"epoch": 1.1681999579920186,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048685636689931554,
|
|
"loss": 5.3748,
|
|
"mean_token_accuracy": 0.1702971413731575,
|
|
"num_tokens": 25638619.0,
|
|
"step": 13905
|
|
},
|
|
{
|
|
"entropy": 5.711242389678955,
|
|
"epoch": 1.1686200378071834,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048684627966496803,
|
|
"loss": 5.4614,
|
|
"mean_token_accuracy": 0.1624944031238556,
|
|
"num_tokens": 25648255.0,
|
|
"step": 13910
|
|
},
|
|
{
|
|
"entropy": 5.66675066947937,
|
|
"epoch": 1.1690401176223482,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.00048683618867787284,
|
|
"loss": 5.5036,
|
|
"mean_token_accuracy": 0.16320302784442903,
|
|
"num_tokens": 25657881.0,
|
|
"step": 13915
|
|
},
|
|
{
|
|
"entropy": 5.758802509307861,
|
|
"epoch": 1.1694601974375132,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004868260939382086,
|
|
"loss": 5.5189,
|
|
"mean_token_accuracy": 0.16214758455753325,
|
|
"num_tokens": 25666773.0,
|
|
"step": 13920
|
|
},
|
|
{
|
|
"entropy": 5.709087944030761,
|
|
"epoch": 1.169880277252678,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004868159954461542,
|
|
"loss": 5.4196,
|
|
"mean_token_accuracy": 0.1658247232437134,
|
|
"num_tokens": 25675152.0,
|
|
"step": 13925
|
|
},
|
|
{
|
|
"entropy": 5.730292272567749,
|
|
"epoch": 1.170300357067843,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00048680589320188847,
|
|
"loss": 5.5228,
|
|
"mean_token_accuracy": 0.15880742371082307,
|
|
"num_tokens": 25684962.0,
|
|
"step": 13930
|
|
},
|
|
{
|
|
"entropy": 5.679729652404785,
|
|
"epoch": 1.1707204368830078,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004867957872055904,
|
|
"loss": 5.4287,
|
|
"mean_token_accuracy": 0.16511483937501908,
|
|
"num_tokens": 25693782.0,
|
|
"step": 13935
|
|
},
|
|
{
|
|
"entropy": 5.641356992721557,
|
|
"epoch": 1.1711405166981725,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048678567745743905,
|
|
"loss": 5.3885,
|
|
"mean_token_accuracy": 0.17264550924301147,
|
|
"num_tokens": 25703081.0,
|
|
"step": 13940
|
|
},
|
|
{
|
|
"entropy": 5.658506870269775,
|
|
"epoch": 1.1715605965133375,
|
|
"grad_norm": 1.4140625,
|
|
"learning_rate": 0.0004867755639576135,
|
|
"loss": 5.4104,
|
|
"mean_token_accuracy": 0.16793930828571318,
|
|
"num_tokens": 25711628.0,
|
|
"step": 13945
|
|
},
|
|
{
|
|
"entropy": 5.640485239028931,
|
|
"epoch": 1.1719806763285023,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004867654467062928,
|
|
"loss": 5.4241,
|
|
"mean_token_accuracy": 0.16928267627954482,
|
|
"num_tokens": 25720676.0,
|
|
"step": 13950
|
|
},
|
|
{
|
|
"entropy": 5.649695205688476,
|
|
"epoch": 1.1724007561436673,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048675532570365633,
|
|
"loss": 5.392,
|
|
"mean_token_accuracy": 0.16928528249263763,
|
|
"num_tokens": 25729920.0,
|
|
"step": 13955
|
|
},
|
|
{
|
|
"entropy": 5.625086498260498,
|
|
"epoch": 1.1728208359588321,
|
|
"grad_norm": 2.390625,
|
|
"learning_rate": 0.00048674520094988327,
|
|
"loss": 5.3654,
|
|
"mean_token_accuracy": 0.17307248711585999,
|
|
"num_tokens": 25739745.0,
|
|
"step": 13960
|
|
},
|
|
{
|
|
"entropy": 5.627538824081421,
|
|
"epoch": 1.1732409157739971,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048673507244515303,
|
|
"loss": 5.3779,
|
|
"mean_token_accuracy": 0.16306281685829163,
|
|
"num_tokens": 25748636.0,
|
|
"step": 13965
|
|
},
|
|
{
|
|
"entropy": 5.715080642700196,
|
|
"epoch": 1.173660995589162,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.000486724940189645,
|
|
"loss": 5.5204,
|
|
"mean_token_accuracy": 0.1618019700050354,
|
|
"num_tokens": 25758393.0,
|
|
"step": 13970
|
|
},
|
|
{
|
|
"entropy": 5.68854718208313,
|
|
"epoch": 1.174081075404327,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004867148041835386,
|
|
"loss": 5.5155,
|
|
"mean_token_accuracy": 0.1531493306159973,
|
|
"num_tokens": 25768520.0,
|
|
"step": 13975
|
|
},
|
|
{
|
|
"entropy": 5.570581007003784,
|
|
"epoch": 1.1745011552194917,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.0004867046644270136,
|
|
"loss": 5.3033,
|
|
"mean_token_accuracy": 0.16933976262807846,
|
|
"num_tokens": 25777168.0,
|
|
"step": 13980
|
|
},
|
|
{
|
|
"entropy": 5.755339241027832,
|
|
"epoch": 1.1749212350346565,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004866945209202494,
|
|
"loss": 5.6177,
|
|
"mean_token_accuracy": 0.14852596670389176,
|
|
"num_tokens": 25787042.0,
|
|
"step": 13985
|
|
},
|
|
{
|
|
"entropy": 5.755776023864746,
|
|
"epoch": 1.1753413148498215,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.0004866843736634258,
|
|
"loss": 5.4891,
|
|
"mean_token_accuracy": 0.16482731252908706,
|
|
"num_tokens": 25796784.0,
|
|
"step": 13990
|
|
},
|
|
{
|
|
"entropy": 5.821707010269165,
|
|
"epoch": 1.1757613946649863,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.0004866742226567225,
|
|
"loss": 5.5454,
|
|
"mean_token_accuracy": 0.1584312066435814,
|
|
"num_tokens": 25806285.0,
|
|
"step": 13995
|
|
},
|
|
{
|
|
"entropy": 5.697404479980468,
|
|
"epoch": 1.1761814744801513,
|
|
"grad_norm": 1.8828125,
|
|
"learning_rate": 0.00048666406790031936,
|
|
"loss": 5.3646,
|
|
"mean_token_accuracy": 0.16209536790847778,
|
|
"num_tokens": 25814889.0,
|
|
"step": 14000
|
|
},
|
|
{
|
|
"entropy": 5.6022241592407225,
|
|
"epoch": 1.176601554295316,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004866539093943962,
|
|
"loss": 5.4378,
|
|
"mean_token_accuracy": 0.1663196012377739,
|
|
"num_tokens": 25824551.0,
|
|
"step": 14005
|
|
},
|
|
{
|
|
"entropy": 5.756335592269897,
|
|
"epoch": 1.1770216341104809,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048664374713913304,
|
|
"loss": 5.5054,
|
|
"mean_token_accuracy": 0.15789050087332726,
|
|
"num_tokens": 25834482.0,
|
|
"step": 14010
|
|
},
|
|
{
|
|
"entropy": 5.777653217315674,
|
|
"epoch": 1.177441713925646,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004866335811347099,
|
|
"loss": 5.5015,
|
|
"mean_token_accuracy": 0.15598200112581254,
|
|
"num_tokens": 25843274.0,
|
|
"step": 14015
|
|
},
|
|
{
|
|
"entropy": 5.761362743377686,
|
|
"epoch": 1.1778617937408107,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.00048662341138130683,
|
|
"loss": 5.4877,
|
|
"mean_token_accuracy": 0.158286252617836,
|
|
"num_tokens": 25852482.0,
|
|
"step": 14020
|
|
},
|
|
{
|
|
"entropy": 5.748376035690308,
|
|
"epoch": 1.1782818735559757,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048661323787910405,
|
|
"loss": 5.4725,
|
|
"mean_token_accuracy": 0.1600512832403183,
|
|
"num_tokens": 25862657.0,
|
|
"step": 14025
|
|
},
|
|
{
|
|
"entropy": 5.64421010017395,
|
|
"epoch": 1.1787019533711405,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004866030606282817,
|
|
"loss": 5.4185,
|
|
"mean_token_accuracy": 0.1709764003753662,
|
|
"num_tokens": 25871492.0,
|
|
"step": 14030
|
|
},
|
|
{
|
|
"entropy": 5.638793468475342,
|
|
"epoch": 1.1791220331863055,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00048659287962902006,
|
|
"loss": 5.4203,
|
|
"mean_token_accuracy": 0.1675298035144806,
|
|
"num_tokens": 25880979.0,
|
|
"step": 14035
|
|
},
|
|
{
|
|
"entropy": 5.696149921417236,
|
|
"epoch": 1.1795421130014703,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00048658269488149945,
|
|
"loss": 5.4218,
|
|
"mean_token_accuracy": 0.1625008463859558,
|
|
"num_tokens": 25891060.0,
|
|
"step": 14040
|
|
},
|
|
{
|
|
"entropy": 5.761597537994385,
|
|
"epoch": 1.1799621928166353,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004865725063859005,
|
|
"loss": 5.544,
|
|
"mean_token_accuracy": 0.16607259213924408,
|
|
"num_tokens": 25900421.0,
|
|
"step": 14045
|
|
},
|
|
{
|
|
"entropy": 5.653324842453003,
|
|
"epoch": 1.1803822726318,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048656231414240345,
|
|
"loss": 5.4313,
|
|
"mean_token_accuracy": 0.1618879944086075,
|
|
"num_tokens": 25909614.0,
|
|
"step": 14050
|
|
},
|
|
{
|
|
"entropy": 5.616987228393555,
|
|
"epoch": 1.1808023524469649,
|
|
"grad_norm": 1.421875,
|
|
"learning_rate": 0.000486552118151189,
|
|
"loss": 5.4679,
|
|
"mean_token_accuracy": 0.15664371848106384,
|
|
"num_tokens": 25919324.0,
|
|
"step": 14055
|
|
},
|
|
{
|
|
"entropy": 5.717600774765015,
|
|
"epoch": 1.1812224322621299,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048654191841243763,
|
|
"loss": 5.4666,
|
|
"mean_token_accuracy": 0.16498553603887559,
|
|
"num_tokens": 25928818.0,
|
|
"step": 14060
|
|
},
|
|
{
|
|
"entropy": 5.759012937545776,
|
|
"epoch": 1.1816425120772946,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004865317149263301,
|
|
"loss": 5.5292,
|
|
"mean_token_accuracy": 0.1587470844388008,
|
|
"num_tokens": 25938148.0,
|
|
"step": 14065
|
|
},
|
|
{
|
|
"entropy": 5.616943883895874,
|
|
"epoch": 1.1820625918924597,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004865215076930473,
|
|
"loss": 5.4301,
|
|
"mean_token_accuracy": 0.1658810332417488,
|
|
"num_tokens": 25947210.0,
|
|
"step": 14070
|
|
},
|
|
{
|
|
"entropy": 5.620659303665161,
|
|
"epoch": 1.1824826717076244,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004865112967127697,
|
|
"loss": 5.4247,
|
|
"mean_token_accuracy": 0.16346285492181778,
|
|
"num_tokens": 25955949.0,
|
|
"step": 14075
|
|
},
|
|
{
|
|
"entropy": 5.575088262557983,
|
|
"epoch": 1.1829027515227892,
|
|
"grad_norm": 1.453125,
|
|
"learning_rate": 0.0004865010819856786,
|
|
"loss": 5.3599,
|
|
"mean_token_accuracy": 0.16429171711206436,
|
|
"num_tokens": 25964193.0,
|
|
"step": 14080
|
|
},
|
|
{
|
|
"entropy": 5.644189167022705,
|
|
"epoch": 1.1833228313379542,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.0004864908635119546,
|
|
"loss": 5.4325,
|
|
"mean_token_accuracy": 0.16631924957036973,
|
|
"num_tokens": 25973141.0,
|
|
"step": 14085
|
|
},
|
|
{
|
|
"entropy": 5.723185777664185,
|
|
"epoch": 1.183742911153119,
|
|
"grad_norm": 1.5078125,
|
|
"learning_rate": 0.0004864806412917788,
|
|
"loss": 5.5128,
|
|
"mean_token_accuracy": 0.16188513338565827,
|
|
"num_tokens": 25982650.0,
|
|
"step": 14090
|
|
},
|
|
{
|
|
"entropy": 5.752957391738891,
|
|
"epoch": 1.184162990968284,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004864704153253325,
|
|
"loss": 5.5041,
|
|
"mean_token_accuracy": 0.15529987812042237,
|
|
"num_tokens": 25992096.0,
|
|
"step": 14095
|
|
},
|
|
{
|
|
"entropy": 5.73392686843872,
|
|
"epoch": 1.1845830707834488,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00048646018561279665,
|
|
"loss": 5.4828,
|
|
"mean_token_accuracy": 0.16362193524837493,
|
|
"num_tokens": 26002063.0,
|
|
"step": 14100
|
|
},
|
|
{
|
|
"entropy": 5.537550973892212,
|
|
"epoch": 1.1850031505986138,
|
|
"grad_norm": 1.4765625,
|
|
"learning_rate": 0.00048644995215435245,
|
|
"loss": 5.3166,
|
|
"mean_token_accuracy": 0.17091075628995894,
|
|
"num_tokens": 26010716.0,
|
|
"step": 14105
|
|
},
|
|
{
|
|
"entropy": 5.664225101470947,
|
|
"epoch": 1.1854232304137786,
|
|
"grad_norm": 2.578125,
|
|
"learning_rate": 0.0004864397149501812,
|
|
"loss": 5.3975,
|
|
"mean_token_accuracy": 0.1652759626507759,
|
|
"num_tokens": 26019136.0,
|
|
"step": 14110
|
|
},
|
|
{
|
|
"entropy": 5.679570627212525,
|
|
"epoch": 1.1858433102289434,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048642947400046434,
|
|
"loss": 5.4205,
|
|
"mean_token_accuracy": 0.17468804866075516,
|
|
"num_tokens": 26028029.0,
|
|
"step": 14115
|
|
},
|
|
{
|
|
"entropy": 5.725770425796509,
|
|
"epoch": 1.1862633900441084,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048641922930538325,
|
|
"loss": 5.578,
|
|
"mean_token_accuracy": 0.15540485978126525,
|
|
"num_tokens": 26038025.0,
|
|
"step": 14120
|
|
},
|
|
{
|
|
"entropy": 5.6925132274627686,
|
|
"epoch": 1.1866834698592732,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004864089808651193,
|
|
"loss": 5.5527,
|
|
"mean_token_accuracy": 0.1457514375448227,
|
|
"num_tokens": 26048427.0,
|
|
"step": 14125
|
|
},
|
|
{
|
|
"entropy": 5.681820917129516,
|
|
"epoch": 1.1871035496744382,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004863987286798541,
|
|
"loss": 5.3589,
|
|
"mean_token_accuracy": 0.1656435549259186,
|
|
"num_tokens": 26057682.0,
|
|
"step": 14130
|
|
},
|
|
{
|
|
"entropy": 5.619316673278808,
|
|
"epoch": 1.187523629489603,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004863884727497693,
|
|
"loss": 5.421,
|
|
"mean_token_accuracy": 0.16528261303901673,
|
|
"num_tokens": 26066562.0,
|
|
"step": 14135
|
|
},
|
|
{
|
|
"entropy": 5.603818464279175,
|
|
"epoch": 1.187943709304768,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004863782130750466,
|
|
"loss": 5.3489,
|
|
"mean_token_accuracy": 0.16920543015003203,
|
|
"num_tokens": 26075633.0,
|
|
"step": 14140
|
|
},
|
|
{
|
|
"entropy": 5.7123847007751465,
|
|
"epoch": 1.1883637891199328,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.00048636794965586764,
|
|
"loss": 5.5142,
|
|
"mean_token_accuracy": 0.1616382658481598,
|
|
"num_tokens": 26085160.0,
|
|
"step": 14145
|
|
},
|
|
{
|
|
"entropy": 5.671110486984253,
|
|
"epoch": 1.1887838689350976,
|
|
"grad_norm": 1.4453125,
|
|
"learning_rate": 0.00048635768249241434,
|
|
"loss": 5.3997,
|
|
"mean_token_accuracy": 0.1647752344608307,
|
|
"num_tokens": 26094157.0,
|
|
"step": 14150
|
|
},
|
|
{
|
|
"entropy": 5.748544454574585,
|
|
"epoch": 1.1892039487502626,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.0004863474115848685,
|
|
"loss": 5.5073,
|
|
"mean_token_accuracy": 0.16841865181922913,
|
|
"num_tokens": 26104459.0,
|
|
"step": 14155
|
|
},
|
|
{
|
|
"entropy": 5.6415163516998295,
|
|
"epoch": 1.1896240285654274,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.00048633713693341214,
|
|
"loss": 5.4513,
|
|
"mean_token_accuracy": 0.16421539783477784,
|
|
"num_tokens": 26114468.0,
|
|
"step": 14160
|
|
},
|
|
{
|
|
"entropy": 5.602667903900146,
|
|
"epoch": 1.1900441083805924,
|
|
"grad_norm": 1.703125,
|
|
"learning_rate": 0.00048632685853822714,
|
|
"loss": 5.4233,
|
|
"mean_token_accuracy": 0.16077221781015397,
|
|
"num_tokens": 26123408.0,
|
|
"step": 14165
|
|
},
|
|
{
|
|
"entropy": 5.65820803642273,
|
|
"epoch": 1.1904641881957572,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004863165763994957,
|
|
"loss": 5.4277,
|
|
"mean_token_accuracy": 0.16007840856909752,
|
|
"num_tokens": 26132692.0,
|
|
"step": 14170
|
|
},
|
|
{
|
|
"entropy": 5.831601476669311,
|
|
"epoch": 1.190884268010922,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004863062905173999,
|
|
"loss": 5.6045,
|
|
"mean_token_accuracy": 0.15579294115304948,
|
|
"num_tokens": 26142259.0,
|
|
"step": 14175
|
|
},
|
|
{
|
|
"entropy": 5.72370457649231,
|
|
"epoch": 1.191304347826087,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000486296000892122,
|
|
"loss": 5.457,
|
|
"mean_token_accuracy": 0.15974125415086746,
|
|
"num_tokens": 26151782.0,
|
|
"step": 14180
|
|
},
|
|
{
|
|
"entropy": 5.591598129272461,
|
|
"epoch": 1.1917244276412517,
|
|
"grad_norm": 4.0625,
|
|
"learning_rate": 0.00048628570752384424,
|
|
"loss": 5.2925,
|
|
"mean_token_accuracy": 0.17002766877412795,
|
|
"num_tokens": 26160449.0,
|
|
"step": 14185
|
|
},
|
|
{
|
|
"entropy": 5.6798159122467045,
|
|
"epoch": 1.1921445074564168,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048627541041274897,
|
|
"loss": 5.5294,
|
|
"mean_token_accuracy": 0.15847962498664855,
|
|
"num_tokens": 26169764.0,
|
|
"step": 14190
|
|
},
|
|
{
|
|
"entropy": 5.6722887516021725,
|
|
"epoch": 1.1925645872715815,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048626510955901854,
|
|
"loss": 5.388,
|
|
"mean_token_accuracy": 0.16532097309827803,
|
|
"num_tokens": 26178759.0,
|
|
"step": 14195
|
|
},
|
|
{
|
|
"entropy": 5.752083778381348,
|
|
"epoch": 1.1929846670867466,
|
|
"grad_norm": 1.734375,
|
|
"learning_rate": 0.0004862548049628356,
|
|
"loss": 5.5175,
|
|
"mean_token_accuracy": 0.1655479848384857,
|
|
"num_tokens": 26187904.0,
|
|
"step": 14200
|
|
},
|
|
{
|
|
"entropy": 5.709446907043457,
|
|
"epoch": 1.1934047469019113,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.0004862444966243824,
|
|
"loss": 5.4362,
|
|
"mean_token_accuracy": 0.16655133068561553,
|
|
"num_tokens": 26196563.0,
|
|
"step": 14205
|
|
},
|
|
{
|
|
"entropy": 5.716876649856568,
|
|
"epoch": 1.1938248267170763,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004862341845438419,
|
|
"loss": 5.4618,
|
|
"mean_token_accuracy": 0.16468843668699265,
|
|
"num_tokens": 26206573.0,
|
|
"step": 14210
|
|
},
|
|
{
|
|
"entropy": 5.640651273727417,
|
|
"epoch": 1.1942449065322411,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.00048622386872139645,
|
|
"loss": 5.3703,
|
|
"mean_token_accuracy": 0.16637713611125945,
|
|
"num_tokens": 26215308.0,
|
|
"step": 14215
|
|
},
|
|
{
|
|
"entropy": 5.623022794723511,
|
|
"epoch": 1.194664986347406,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000486213549157229,
|
|
"loss": 5.4308,
|
|
"mean_token_accuracy": 0.1651561066508293,
|
|
"num_tokens": 26224379.0,
|
|
"step": 14220
|
|
},
|
|
{
|
|
"entropy": 5.606683444976807,
|
|
"epoch": 1.195085066162571,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.0004862032258515222,
|
|
"loss": 5.4237,
|
|
"mean_token_accuracy": 0.1671927824616432,
|
|
"num_tokens": 26233620.0,
|
|
"step": 14225
|
|
},
|
|
{
|
|
"entropy": 5.685428619384766,
|
|
"epoch": 1.1955051459777357,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004861928988044592,
|
|
"loss": 5.4885,
|
|
"mean_token_accuracy": 0.16064939498901368,
|
|
"num_tokens": 26242556.0,
|
|
"step": 14230
|
|
},
|
|
{
|
|
"entropy": 5.704376649856568,
|
|
"epoch": 1.1959252257929007,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004861825680162226,
|
|
"loss": 5.4799,
|
|
"mean_token_accuracy": 0.16530386954545975,
|
|
"num_tokens": 26251561.0,
|
|
"step": 14235
|
|
},
|
|
{
|
|
"entropy": 5.671692752838135,
|
|
"epoch": 1.1963453056080655,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.00048617223348699546,
|
|
"loss": 5.395,
|
|
"mean_token_accuracy": 0.1631077140569687,
|
|
"num_tokens": 26261115.0,
|
|
"step": 14240
|
|
},
|
|
{
|
|
"entropy": 5.7645580768585205,
|
|
"epoch": 1.1967653854232303,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.0004861618952169611,
|
|
"loss": 5.5554,
|
|
"mean_token_accuracy": 0.1650903783738613,
|
|
"num_tokens": 26271165.0,
|
|
"step": 14245
|
|
},
|
|
{
|
|
"entropy": 5.668324375152588,
|
|
"epoch": 1.1971854652383953,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004861515532063025,
|
|
"loss": 5.5081,
|
|
"mean_token_accuracy": 0.16101304292678834,
|
|
"num_tokens": 26280822.0,
|
|
"step": 14250
|
|
},
|
|
{
|
|
"entropy": 5.678185224533081,
|
|
"epoch": 1.19760554505356,
|
|
"grad_norm": 1.2734375,
|
|
"learning_rate": 0.00048614120745520275,
|
|
"loss": 5.3902,
|
|
"mean_token_accuracy": 0.16933465003967285,
|
|
"num_tokens": 26288747.0,
|
|
"step": 14255
|
|
},
|
|
{
|
|
"entropy": 5.661726713180542,
|
|
"epoch": 1.198025624868725,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.00048613085796384524,
|
|
"loss": 5.4684,
|
|
"mean_token_accuracy": 0.1568397268652916,
|
|
"num_tokens": 26298387.0,
|
|
"step": 14260
|
|
},
|
|
{
|
|
"entropy": 5.607639598846435,
|
|
"epoch": 1.19844570468389,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048612050473241335,
|
|
"loss": 5.3588,
|
|
"mean_token_accuracy": 0.16898838877677919,
|
|
"num_tokens": 26307016.0,
|
|
"step": 14265
|
|
},
|
|
{
|
|
"entropy": 5.663268566131592,
|
|
"epoch": 1.198865784499055,
|
|
"grad_norm": 1.375,
|
|
"learning_rate": 0.0004861101477610905,
|
|
"loss": 5.4788,
|
|
"mean_token_accuracy": 0.1621081128716469,
|
|
"num_tokens": 26316296.0,
|
|
"step": 14270
|
|
},
|
|
{
|
|
"entropy": 5.6753627300262455,
|
|
"epoch": 1.1992858643142197,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048609978705006,
|
|
"loss": 5.464,
|
|
"mean_token_accuracy": 0.16156177669763566,
|
|
"num_tokens": 26325525.0,
|
|
"step": 14275
|
|
},
|
|
{
|
|
"entropy": 5.665592432022095,
|
|
"epoch": 1.1997059441293847,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004860894225995055,
|
|
"loss": 5.3402,
|
|
"mean_token_accuracy": 0.16874586492776872,
|
|
"num_tokens": 26334195.0,
|
|
"step": 14280
|
|
},
|
|
{
|
|
"entropy": 5.6383813381195065,
|
|
"epoch": 1.2001260239445495,
|
|
"grad_norm": 1.3828125,
|
|
"learning_rate": 0.00048607905440961054,
|
|
"loss": 5.4703,
|
|
"mean_token_accuracy": 0.1640348732471466,
|
|
"num_tokens": 26343933.0,
|
|
"step": 14285
|
|
},
|
|
{
|
|
"entropy": 5.629398059844971,
|
|
"epoch": 1.2005461037597143,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048606868248055887,
|
|
"loss": 5.422,
|
|
"mean_token_accuracy": 0.16718345433473586,
|
|
"num_tokens": 26353455.0,
|
|
"step": 14290
|
|
},
|
|
{
|
|
"entropy": 5.7174492359161375,
|
|
"epoch": 1.2009661835748793,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.0004860583068125341,
|
|
"loss": 5.4115,
|
|
"mean_token_accuracy": 0.17318314611911773,
|
|
"num_tokens": 26362662.0,
|
|
"step": 14295
|
|
},
|
|
{
|
|
"entropy": 5.647911453247071,
|
|
"epoch": 1.201386263390044,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.0004860479274057202,
|
|
"loss": 5.4264,
|
|
"mean_token_accuracy": 0.16339855939149855,
|
|
"num_tokens": 26371536.0,
|
|
"step": 14300
|
|
},
|
|
{
|
|
"entropy": 5.729653596878052,
|
|
"epoch": 1.201806343205209,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.00048603754426030087,
|
|
"loss": 5.52,
|
|
"mean_token_accuracy": 0.16162212640047074,
|
|
"num_tokens": 26381925.0,
|
|
"step": 14305
|
|
},
|
|
{
|
|
"entropy": 5.602517032623291,
|
|
"epoch": 1.2022264230203739,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.00048602715737646016,
|
|
"loss": 5.3863,
|
|
"mean_token_accuracy": 0.16718171089887618,
|
|
"num_tokens": 26391111.0,
|
|
"step": 14310
|
|
},
|
|
{
|
|
"entropy": 5.763671970367431,
|
|
"epoch": 1.2026465028355386,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048601676675438197,
|
|
"loss": 5.5468,
|
|
"mean_token_accuracy": 0.1514241561293602,
|
|
"num_tokens": 26401667.0,
|
|
"step": 14315
|
|
},
|
|
{
|
|
"entropy": 5.683077478408814,
|
|
"epoch": 1.2030665826507037,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.00048600637239425045,
|
|
"loss": 5.3705,
|
|
"mean_token_accuracy": 0.1719566836953163,
|
|
"num_tokens": 26411261.0,
|
|
"step": 14320
|
|
},
|
|
{
|
|
"entropy": 5.641879558563232,
|
|
"epoch": 1.2034866624658684,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00048599597429624966,
|
|
"loss": 5.5082,
|
|
"mean_token_accuracy": 0.16119082495570183,
|
|
"num_tokens": 26419808.0,
|
|
"step": 14325
|
|
},
|
|
{
|
|
"entropy": 5.675163888931275,
|
|
"epoch": 1.2039067422810334,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048598557246056385,
|
|
"loss": 5.4305,
|
|
"mean_token_accuracy": 0.16555157899856568,
|
|
"num_tokens": 26429160.0,
|
|
"step": 14330
|
|
},
|
|
{
|
|
"entropy": 5.699215984344482,
|
|
"epoch": 1.2043268220961982,
|
|
"grad_norm": 2.046875,
|
|
"learning_rate": 0.00048597516688737727,
|
|
"loss": 5.3938,
|
|
"mean_token_accuracy": 0.16645348966121673,
|
|
"num_tokens": 26437675.0,
|
|
"step": 14335
|
|
},
|
|
{
|
|
"entropy": 5.680204963684082,
|
|
"epoch": 1.2047469019113632,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00048596475757687425,
|
|
"loss": 5.449,
|
|
"mean_token_accuracy": 0.16452098041772842,
|
|
"num_tokens": 26446317.0,
|
|
"step": 14340
|
|
},
|
|
{
|
|
"entropy": 5.699273586273193,
|
|
"epoch": 1.205166981726528,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00048595434452923915,
|
|
"loss": 5.4949,
|
|
"mean_token_accuracy": 0.16454333513975145,
|
|
"num_tokens": 26456183.0,
|
|
"step": 14345
|
|
},
|
|
{
|
|
"entropy": 5.6539466857910154,
|
|
"epoch": 1.205587061541693,
|
|
"grad_norm": 1.4375,
|
|
"learning_rate": 0.00048594392774465656,
|
|
"loss": 5.4311,
|
|
"mean_token_accuracy": 0.15812044590711594,
|
|
"num_tokens": 26466324.0,
|
|
"step": 14350
|
|
},
|
|
{
|
|
"entropy": 5.667297124862671,
|
|
"epoch": 1.2060071413568578,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048593350722331074,
|
|
"loss": 5.4471,
|
|
"mean_token_accuracy": 0.16772420853376388,
|
|
"num_tokens": 26475560.0,
|
|
"step": 14355
|
|
},
|
|
{
|
|
"entropy": 5.610403156280517,
|
|
"epoch": 1.2064272211720226,
|
|
"grad_norm": 1.875,
|
|
"learning_rate": 0.00048592308296538654,
|
|
"loss": 5.4173,
|
|
"mean_token_accuracy": 0.16000428274273873,
|
|
"num_tokens": 26484955.0,
|
|
"step": 14360
|
|
},
|
|
{
|
|
"entropy": 5.65930495262146,
|
|
"epoch": 1.2068473009871876,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004859126549710686,
|
|
"loss": 5.3811,
|
|
"mean_token_accuracy": 0.1743646025657654,
|
|
"num_tokens": 26494306.0,
|
|
"step": 14365
|
|
},
|
|
{
|
|
"entropy": 5.64382495880127,
|
|
"epoch": 1.2072673808023524,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.00048590222324054153,
|
|
"loss": 5.3942,
|
|
"mean_token_accuracy": 0.1713266119360924,
|
|
"num_tokens": 26503871.0,
|
|
"step": 14370
|
|
},
|
|
{
|
|
"entropy": 5.720830965042114,
|
|
"epoch": 1.2076874606175174,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004858917877739901,
|
|
"loss": 5.4942,
|
|
"mean_token_accuracy": 0.16498788744211196,
|
|
"num_tokens": 26511929.0,
|
|
"step": 14375
|
|
},
|
|
{
|
|
"entropy": 5.695744895935059,
|
|
"epoch": 1.2081075404326822,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.0004858813485715994,
|
|
"loss": 5.4881,
|
|
"mean_token_accuracy": 0.15547695755958557,
|
|
"num_tokens": 26520469.0,
|
|
"step": 14380
|
|
},
|
|
{
|
|
"entropy": 5.6886962890625,
|
|
"epoch": 1.208527620247847,
|
|
"grad_norm": 1.6015625,
|
|
"learning_rate": 0.0004858709056335541,
|
|
"loss": 5.458,
|
|
"mean_token_accuracy": 0.16531693637371064,
|
|
"num_tokens": 26530102.0,
|
|
"step": 14385
|
|
},
|
|
{
|
|
"entropy": 5.668837690353394,
|
|
"epoch": 1.208947700063012,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00048586045896003926,
|
|
"loss": 5.4623,
|
|
"mean_token_accuracy": 0.16233531534671783,
|
|
"num_tokens": 26538705.0,
|
|
"step": 14390
|
|
},
|
|
{
|
|
"entropy": 5.7534499168396,
|
|
"epoch": 1.2093677798781768,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.0004858500085512401,
|
|
"loss": 5.5504,
|
|
"mean_token_accuracy": 0.16140262633562089,
|
|
"num_tokens": 26548315.0,
|
|
"step": 14395
|
|
},
|
|
{
|
|
"entropy": 5.734239721298218,
|
|
"epoch": 1.2097878596933418,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00048583955440734144,
|
|
"loss": 5.3839,
|
|
"mean_token_accuracy": 0.16452737748622895,
|
|
"num_tokens": 26556412.0,
|
|
"step": 14400
|
|
},
|
|
{
|
|
"entropy": 5.757485628128052,
|
|
"epoch": 1.2102079395085066,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.00048582909652852873,
|
|
"loss": 5.5363,
|
|
"mean_token_accuracy": 0.15494186282157899,
|
|
"num_tokens": 26566146.0,
|
|
"step": 14405
|
|
},
|
|
{
|
|
"entropy": 5.666713428497315,
|
|
"epoch": 1.2106280193236716,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004858186349149871,
|
|
"loss": 5.4291,
|
|
"mean_token_accuracy": 0.16879764646291734,
|
|
"num_tokens": 26576019.0,
|
|
"step": 14410
|
|
},
|
|
{
|
|
"entropy": 5.547569131851196,
|
|
"epoch": 1.2110480991388364,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.000485808169566902,
|
|
"loss": 5.2952,
|
|
"mean_token_accuracy": 0.16971922367811204,
|
|
"num_tokens": 26585461.0,
|
|
"step": 14415
|
|
},
|
|
{
|
|
"entropy": 5.549987030029297,
|
|
"epoch": 1.2114681789540014,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.00048579770048445863,
|
|
"loss": 5.3289,
|
|
"mean_token_accuracy": 0.18390253633260728,
|
|
"num_tokens": 26594021.0,
|
|
"step": 14420
|
|
},
|
|
{
|
|
"entropy": 5.748401546478272,
|
|
"epoch": 1.2118882587691662,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048578722766784253,
|
|
"loss": 5.481,
|
|
"mean_token_accuracy": 0.16007982641458512,
|
|
"num_tokens": 26602712.0,
|
|
"step": 14425
|
|
},
|
|
{
|
|
"entropy": 5.627417659759521,
|
|
"epoch": 1.212308338584331,
|
|
"grad_norm": 1.765625,
|
|
"learning_rate": 0.00048577675111723925,
|
|
"loss": 5.1783,
|
|
"mean_token_accuracy": 0.1806653767824173,
|
|
"num_tokens": 26610970.0,
|
|
"step": 14430
|
|
},
|
|
{
|
|
"entropy": 5.627402400970459,
|
|
"epoch": 1.212728418399496,
|
|
"grad_norm": 1.9921875,
|
|
"learning_rate": 0.00048576627083283435,
|
|
"loss": 5.4717,
|
|
"mean_token_accuracy": 0.1654764398932457,
|
|
"num_tokens": 26619840.0,
|
|
"step": 14435
|
|
},
|
|
{
|
|
"entropy": 5.632164764404297,
|
|
"epoch": 1.2131484982146608,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004857557868148136,
|
|
"loss": 5.3626,
|
|
"mean_token_accuracy": 0.17133797258138656,
|
|
"num_tokens": 26629271.0,
|
|
"step": 14440
|
|
},
|
|
{
|
|
"entropy": 5.661153793334961,
|
|
"epoch": 1.2135685780298258,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.0004857452990633625,
|
|
"loss": 5.4142,
|
|
"mean_token_accuracy": 0.1605449289083481,
|
|
"num_tokens": 26638610.0,
|
|
"step": 14445
|
|
},
|
|
{
|
|
"entropy": 5.73473424911499,
|
|
"epoch": 1.2139886578449905,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.00048573480757866695,
|
|
"loss": 5.5548,
|
|
"mean_token_accuracy": 0.1624301940202713,
|
|
"num_tokens": 26648504.0,
|
|
"step": 14450
|
|
},
|
|
{
|
|
"entropy": 5.697289085388183,
|
|
"epoch": 1.2144087376601553,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.00048572431236091284,
|
|
"loss": 5.4489,
|
|
"mean_token_accuracy": 0.1593151181936264,
|
|
"num_tokens": 26658084.0,
|
|
"step": 14455
|
|
},
|
|
{
|
|
"entropy": 5.722338914871216,
|
|
"epoch": 1.2148288174753203,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.00048571381341028604,
|
|
"loss": 5.5185,
|
|
"mean_token_accuracy": 0.16407164931297302,
|
|
"num_tokens": 26666933.0,
|
|
"step": 14460
|
|
},
|
|
{
|
|
"entropy": 5.739486169815064,
|
|
"epoch": 1.2152488972904851,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004857033107269725,
|
|
"loss": 5.4105,
|
|
"mean_token_accuracy": 0.16758025884628297,
|
|
"num_tokens": 26675049.0,
|
|
"step": 14465
|
|
},
|
|
{
|
|
"entropy": 5.621294927597046,
|
|
"epoch": 1.2156689771056501,
|
|
"grad_norm": 2.15625,
|
|
"learning_rate": 0.00048569280431115823,
|
|
"loss": 5.4754,
|
|
"mean_token_accuracy": 0.16362982392311096,
|
|
"num_tokens": 26684223.0,
|
|
"step": 14470
|
|
},
|
|
{
|
|
"entropy": 5.661048316955567,
|
|
"epoch": 1.216089056920815,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004856822941630296,
|
|
"loss": 5.4118,
|
|
"mean_token_accuracy": 0.160876327753067,
|
|
"num_tokens": 26693605.0,
|
|
"step": 14475
|
|
},
|
|
{
|
|
"entropy": 5.75145697593689,
|
|
"epoch": 1.2165091367359797,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048567178028277255,
|
|
"loss": 5.473,
|
|
"mean_token_accuracy": 0.1669105038046837,
|
|
"num_tokens": 26702829.0,
|
|
"step": 14480
|
|
},
|
|
{
|
|
"entropy": 5.730476236343383,
|
|
"epoch": 1.2169292165511447,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004856612626705733,
|
|
"loss": 5.5238,
|
|
"mean_token_accuracy": 0.15920734852552415,
|
|
"num_tokens": 26712466.0,
|
|
"step": 14485
|
|
},
|
|
{
|
|
"entropy": 5.672338390350342,
|
|
"epoch": 1.2173492963663095,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004856507413266183,
|
|
"loss": 5.4036,
|
|
"mean_token_accuracy": 0.1744743689894676,
|
|
"num_tokens": 26721730.0,
|
|
"step": 14490
|
|
},
|
|
{
|
|
"entropy": 5.648630571365357,
|
|
"epoch": 1.2177693761814745,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.000485640216251094,
|
|
"loss": 5.476,
|
|
"mean_token_accuracy": 0.16147005409002305,
|
|
"num_tokens": 26731017.0,
|
|
"step": 14495
|
|
},
|
|
{
|
|
"entropy": 5.728749513626099,
|
|
"epoch": 1.2181894559966393,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.00048562968744418665,
|
|
"loss": 5.4411,
|
|
"mean_token_accuracy": 0.16135525703430176,
|
|
"num_tokens": 26739588.0,
|
|
"step": 14500
|
|
},
|
|
{
|
|
"entropy": 5.750088310241699,
|
|
"epoch": 1.2186095358118043,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.0004856191549060828,
|
|
"loss": 5.5545,
|
|
"mean_token_accuracy": 0.15382490158081055,
|
|
"num_tokens": 26748889.0,
|
|
"step": 14505
|
|
},
|
|
{
|
|
"entropy": 5.731989622116089,
|
|
"epoch": 1.219029615626969,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.00048560861863696913,
|
|
"loss": 5.503,
|
|
"mean_token_accuracy": 0.16170719712972642,
|
|
"num_tokens": 26757979.0,
|
|
"step": 14510
|
|
},
|
|
{
|
|
"entropy": 5.755252075195313,
|
|
"epoch": 1.219449695442134,
|
|
"grad_norm": 2.171875,
|
|
"learning_rate": 0.0004855980786370322,
|
|
"loss": 5.4249,
|
|
"mean_token_accuracy": 0.16454763114452362,
|
|
"num_tokens": 26767225.0,
|
|
"step": 14515
|
|
},
|
|
{
|
|
"entropy": 5.638862180709839,
|
|
"epoch": 1.219869775257299,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004855875349064588,
|
|
"loss": 5.3742,
|
|
"mean_token_accuracy": 0.16612455993890762,
|
|
"num_tokens": 26776289.0,
|
|
"step": 14520
|
|
},
|
|
{
|
|
"entropy": 5.709742593765259,
|
|
"epoch": 1.2202898550724637,
|
|
"grad_norm": 1.65625,
|
|
"learning_rate": 0.0004855769874454356,
|
|
"loss": 5.4903,
|
|
"mean_token_accuracy": 0.15808172821998595,
|
|
"num_tokens": 26785631.0,
|
|
"step": 14525
|
|
},
|
|
{
|
|
"entropy": 5.647239732742309,
|
|
"epoch": 1.2207099348876287,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004855664362541495,
|
|
"loss": 5.5121,
|
|
"mean_token_accuracy": 0.15897773057222367,
|
|
"num_tokens": 26795285.0,
|
|
"step": 14530
|
|
},
|
|
{
|
|
"entropy": 5.628441572189331,
|
|
"epoch": 1.2211300147027935,
|
|
"grad_norm": 1.640625,
|
|
"learning_rate": 0.00048555588133278744,
|
|
"loss": 5.4128,
|
|
"mean_token_accuracy": 0.16046550869941711,
|
|
"num_tokens": 26804584.0,
|
|
"step": 14535
|
|
},
|
|
{
|
|
"entropy": 5.607101106643677,
|
|
"epoch": 1.2215500945179585,
|
|
"grad_norm": 1.46875,
|
|
"learning_rate": 0.0004855453226815363,
|
|
"loss": 5.3055,
|
|
"mean_token_accuracy": 0.16931912302970886,
|
|
"num_tokens": 26814354.0,
|
|
"step": 14540
|
|
},
|
|
{
|
|
"entropy": 5.595360374450683,
|
|
"epoch": 1.2219701743331233,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048553476030058326,
|
|
"loss": 5.3156,
|
|
"mean_token_accuracy": 0.1787246897816658,
|
|
"num_tokens": 26824274.0,
|
|
"step": 14545
|
|
},
|
|
{
|
|
"entropy": 5.593567895889282,
|
|
"epoch": 1.222390254148288,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048552419419011536,
|
|
"loss": 5.4488,
|
|
"mean_token_accuracy": 0.16586222499608994,
|
|
"num_tokens": 26833155.0,
|
|
"step": 14550
|
|
},
|
|
{
|
|
"entropy": 5.637729072570801,
|
|
"epoch": 1.222810333963453,
|
|
"grad_norm": 1.984375,
|
|
"learning_rate": 0.0004855136243503196,
|
|
"loss": 5.3859,
|
|
"mean_token_accuracy": 0.16326625347137452,
|
|
"num_tokens": 26842545.0,
|
|
"step": 14555
|
|
},
|
|
{
|
|
"entropy": 5.714113759994507,
|
|
"epoch": 1.2232304137786179,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048550305078138363,
|
|
"loss": 5.4666,
|
|
"mean_token_accuracy": 0.16542484909296035,
|
|
"num_tokens": 26851772.0,
|
|
"step": 14560
|
|
},
|
|
{
|
|
"entropy": 5.639671373367309,
|
|
"epoch": 1.2236504935937829,
|
|
"grad_norm": 2.1875,
|
|
"learning_rate": 0.00048549247348349435,
|
|
"loss": 5.3469,
|
|
"mean_token_accuracy": 0.16841816902160645,
|
|
"num_tokens": 26860884.0,
|
|
"step": 14565
|
|
},
|
|
{
|
|
"entropy": 5.659556913375854,
|
|
"epoch": 1.2240705734089476,
|
|
"grad_norm": 1.859375,
|
|
"learning_rate": 0.00048548189245683934,
|
|
"loss": 5.4783,
|
|
"mean_token_accuracy": 0.16680554449558258,
|
|
"num_tokens": 26869435.0,
|
|
"step": 14570
|
|
},
|
|
{
|
|
"entropy": 5.661346864700318,
|
|
"epoch": 1.2244906532241127,
|
|
"grad_norm": 1.3984375,
|
|
"learning_rate": 0.00048547130770160596,
|
|
"loss": 5.38,
|
|
"mean_token_accuracy": 0.16684748679399491,
|
|
"num_tokens": 26878852.0,
|
|
"step": 14575
|
|
},
|
|
{
|
|
"entropy": 5.670219469070434,
|
|
"epoch": 1.2249107330392774,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004854607192179817,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.16878951340913773,
|
|
"num_tokens": 26887532.0,
|
|
"step": 14580
|
|
},
|
|
{
|
|
"entropy": 5.798138046264649,
|
|
"epoch": 1.2253308128544425,
|
|
"grad_norm": 2.09375,
|
|
"learning_rate": 0.0004854501270061543,
|
|
"loss": 5.5841,
|
|
"mean_token_accuracy": 0.16193009316921234,
|
|
"num_tokens": 26897459.0,
|
|
"step": 14585
|
|
},
|
|
{
|
|
"entropy": 5.631389856338501,
|
|
"epoch": 1.2257508926696072,
|
|
"grad_norm": 1.59375,
|
|
"learning_rate": 0.00048543953106631115,
|
|
"loss": 5.3437,
|
|
"mean_token_accuracy": 0.17152508199214936,
|
|
"num_tokens": 26907156.0,
|
|
"step": 14590
|
|
},
|
|
{
|
|
"entropy": 5.7155900478363035,
|
|
"epoch": 1.226170972484772,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004854289313986401,
|
|
"loss": 5.4399,
|
|
"mean_token_accuracy": 0.1658498004078865,
|
|
"num_tokens": 26915764.0,
|
|
"step": 14595
|
|
},
|
|
{
|
|
"entropy": 5.578080129623413,
|
|
"epoch": 1.226591052299937,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004854183280033289,
|
|
"loss": 5.293,
|
|
"mean_token_accuracy": 0.17005468606948854,
|
|
"num_tokens": 26924166.0,
|
|
"step": 14600
|
|
},
|
|
{
|
|
"entropy": 5.705555152893067,
|
|
"epoch": 1.2270111321151018,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004854077208805654,
|
|
"loss": 5.5469,
|
|
"mean_token_accuracy": 0.1581970199942589,
|
|
"num_tokens": 26933546.0,
|
|
"step": 14605
|
|
},
|
|
{
|
|
"entropy": 5.711961030960083,
|
|
"epoch": 1.2274312119302668,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004853971100305374,
|
|
"loss": 5.4648,
|
|
"mean_token_accuracy": 0.16808405220508577,
|
|
"num_tokens": 26943213.0,
|
|
"step": 14610
|
|
},
|
|
{
|
|
"entropy": 5.702235364913941,
|
|
"epoch": 1.2278512917454316,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.000485386495453433,
|
|
"loss": 5.4126,
|
|
"mean_token_accuracy": 0.1717974692583084,
|
|
"num_tokens": 26952968.0,
|
|
"step": 14615
|
|
},
|
|
{
|
|
"entropy": 5.671736001968384,
|
|
"epoch": 1.2282713715605964,
|
|
"grad_norm": 1.4296875,
|
|
"learning_rate": 0.00048537587714944007,
|
|
"loss": 5.41,
|
|
"mean_token_accuracy": 0.16740047186613083,
|
|
"num_tokens": 26962230.0,
|
|
"step": 14620
|
|
},
|
|
{
|
|
"entropy": 5.6375446796417235,
|
|
"epoch": 1.2286914513757614,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.0004853652551187469,
|
|
"loss": 5.4774,
|
|
"mean_token_accuracy": 0.17203135490417482,
|
|
"num_tokens": 26970985.0,
|
|
"step": 14625
|
|
},
|
|
{
|
|
"entropy": 5.717348384857178,
|
|
"epoch": 1.2291115311909262,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.00048535462936154147,
|
|
"loss": 5.5114,
|
|
"mean_token_accuracy": 0.160255528986454,
|
|
"num_tokens": 26981138.0,
|
|
"step": 14630
|
|
},
|
|
{
|
|
"entropy": 5.623097467422485,
|
|
"epoch": 1.2295316110060912,
|
|
"grad_norm": 1.96875,
|
|
"learning_rate": 0.0004853439998780122,
|
|
"loss": 5.3563,
|
|
"mean_token_accuracy": 0.17389864474534988,
|
|
"num_tokens": 26990158.0,
|
|
"step": 14635
|
|
},
|
|
{
|
|
"entropy": 5.647943782806396,
|
|
"epoch": 1.229951690821256,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.0004853333666683472,
|
|
"loss": 5.507,
|
|
"mean_token_accuracy": 0.15743157267570496,
|
|
"num_tokens": 26998889.0,
|
|
"step": 14640
|
|
},
|
|
{
|
|
"entropy": 5.691283798217773,
|
|
"epoch": 1.230371770636421,
|
|
"grad_norm": 1.6640625,
|
|
"learning_rate": 0.00048532272973273496,
|
|
"loss": 5.4327,
|
|
"mean_token_accuracy": 0.16522250920534134,
|
|
"num_tokens": 27008912.0,
|
|
"step": 14645
|
|
},
|
|
{
|
|
"entropy": 5.639300966262818,
|
|
"epoch": 1.2307918504515858,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048531208907136384,
|
|
"loss": 5.3326,
|
|
"mean_token_accuracy": 0.17845280468463898,
|
|
"num_tokens": 27017573.0,
|
|
"step": 14650
|
|
},
|
|
{
|
|
"entropy": 5.629958868026733,
|
|
"epoch": 1.2312119302667508,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048530144468442236,
|
|
"loss": 5.4223,
|
|
"mean_token_accuracy": 0.15466590747237205,
|
|
"num_tokens": 27027205.0,
|
|
"step": 14655
|
|
},
|
|
{
|
|
"entropy": 5.702744960784912,
|
|
"epoch": 1.2316320100819156,
|
|
"grad_norm": 1.6875,
|
|
"learning_rate": 0.00048529079657209906,
|
|
"loss": 5.3471,
|
|
"mean_token_accuracy": 0.1684443920850754,
|
|
"num_tokens": 27035882.0,
|
|
"step": 14660
|
|
},
|
|
{
|
|
"entropy": 5.582692718505859,
|
|
"epoch": 1.2320520898970804,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004852801447345826,
|
|
"loss": 5.4266,
|
|
"mean_token_accuracy": 0.17018674015998841,
|
|
"num_tokens": 27044761.0,
|
|
"step": 14665
|
|
},
|
|
{
|
|
"entropy": 5.673105096817016,
|
|
"epoch": 1.2324721697122454,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004852694891720617,
|
|
"loss": 5.4613,
|
|
"mean_token_accuracy": 0.16327601224184035,
|
|
"num_tokens": 27054149.0,
|
|
"step": 14670
|
|
},
|
|
{
|
|
"entropy": 5.7073362350463865,
|
|
"epoch": 1.2328922495274102,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000485258829884725,
|
|
"loss": 5.49,
|
|
"mean_token_accuracy": 0.16526736468076705,
|
|
"num_tokens": 27063145.0,
|
|
"step": 14675
|
|
},
|
|
{
|
|
"entropy": 5.735353565216064,
|
|
"epoch": 1.2333123293425752,
|
|
"grad_norm": 1.7265625,
|
|
"learning_rate": 0.0004852481668727614,
|
|
"loss": 5.4537,
|
|
"mean_token_accuracy": 0.16542838364839554,
|
|
"num_tokens": 27072378.0,
|
|
"step": 14680
|
|
},
|
|
{
|
|
"entropy": 5.6080183506011965,
|
|
"epoch": 1.23373240915774,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048523750013635986,
|
|
"loss": 5.3308,
|
|
"mean_token_accuracy": 0.16522957533597946,
|
|
"num_tokens": 27082241.0,
|
|
"step": 14685
|
|
},
|
|
{
|
|
"entropy": 5.59213662147522,
|
|
"epoch": 1.2341524889729047,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004852268296757092,
|
|
"loss": 5.3492,
|
|
"mean_token_accuracy": 0.16783910393714904,
|
|
"num_tokens": 27091488.0,
|
|
"step": 14690
|
|
},
|
|
{
|
|
"entropy": 5.69573221206665,
|
|
"epoch": 1.2345725687880698,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004852161554909985,
|
|
"loss": 5.4111,
|
|
"mean_token_accuracy": 0.16741783171892166,
|
|
"num_tokens": 27100378.0,
|
|
"step": 14695
|
|
},
|
|
{
|
|
"entropy": 5.6991900444030765,
|
|
"epoch": 1.2349926486032345,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.00048520547758241686,
|
|
"loss": 5.4165,
|
|
"mean_token_accuracy": 0.1690449208021164,
|
|
"num_tokens": 27110341.0,
|
|
"step": 14700
|
|
},
|
|
{
|
|
"entropy": 5.683678340911865,
|
|
"epoch": 1.2354127284183996,
|
|
"grad_norm": 1.5859375,
|
|
"learning_rate": 0.00048519479595015343,
|
|
"loss": 5.3629,
|
|
"mean_token_accuracy": 0.16223382204771042,
|
|
"num_tokens": 27119381.0,
|
|
"step": 14705
|
|
},
|
|
{
|
|
"entropy": 5.618437147140503,
|
|
"epoch": 1.2358328082335643,
|
|
"grad_norm": 1.578125,
|
|
"learning_rate": 0.00048518411059439746,
|
|
"loss": 5.4655,
|
|
"mean_token_accuracy": 0.15772671699523927,
|
|
"num_tokens": 27129167.0,
|
|
"step": 14710
|
|
},
|
|
{
|
|
"entropy": 5.634173774719239,
|
|
"epoch": 1.2362528880487293,
|
|
"grad_norm": 1.8984375,
|
|
"learning_rate": 0.00048517342151533813,
|
|
"loss": 5.4712,
|
|
"mean_token_accuracy": 0.1590983271598816,
|
|
"num_tokens": 27138479.0,
|
|
"step": 14715
|
|
},
|
|
{
|
|
"entropy": 5.642277622222901,
|
|
"epoch": 1.2366729678638941,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.0004851627287131649,
|
|
"loss": 5.3531,
|
|
"mean_token_accuracy": 0.1717111274600029,
|
|
"num_tokens": 27147197.0,
|
|
"step": 14720
|
|
},
|
|
{
|
|
"entropy": 5.628361082077026,
|
|
"epoch": 1.2370930476790591,
|
|
"grad_norm": 1.8671875,
|
|
"learning_rate": 0.0004851520321880672,
|
|
"loss": 5.3891,
|
|
"mean_token_accuracy": 0.17068208754062653,
|
|
"num_tokens": 27155854.0,
|
|
"step": 14725
|
|
},
|
|
{
|
|
"entropy": 5.6246418952941895,
|
|
"epoch": 1.237513127494224,
|
|
"grad_norm": 1.9140625,
|
|
"learning_rate": 0.0004851413319402344,
|
|
"loss": 5.3439,
|
|
"mean_token_accuracy": 0.16404491513967515,
|
|
"num_tokens": 27165069.0,
|
|
"step": 14730
|
|
},
|
|
{
|
|
"entropy": 5.666826486587524,
|
|
"epoch": 1.2379332073093887,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004851306279698561,
|
|
"loss": 5.4071,
|
|
"mean_token_accuracy": 0.16024885028600694,
|
|
"num_tokens": 27174070.0,
|
|
"step": 14735
|
|
},
|
|
{
|
|
"entropy": 5.772764015197754,
|
|
"epoch": 1.2383532871245537,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004851199202771219,
|
|
"loss": 5.483,
|
|
"mean_token_accuracy": 0.16502515822649003,
|
|
"num_tokens": 27182903.0,
|
|
"step": 14740
|
|
},
|
|
{
|
|
"entropy": 5.702397966384888,
|
|
"epoch": 1.2387733669397185,
|
|
"grad_norm": 1.7421875,
|
|
"learning_rate": 0.0004851092088622216,
|
|
"loss": 5.401,
|
|
"mean_token_accuracy": 0.16937888264656067,
|
|
"num_tokens": 27192747.0,
|
|
"step": 14745
|
|
},
|
|
{
|
|
"entropy": 5.664963483810425,
|
|
"epoch": 1.2391934467548835,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004850984937253448,
|
|
"loss": 5.3985,
|
|
"mean_token_accuracy": 0.16960911005735396,
|
|
"num_tokens": 27201657.0,
|
|
"step": 14750
|
|
},
|
|
{
|
|
"entropy": 5.632458209991455,
|
|
"epoch": 1.2396135265700483,
|
|
"grad_norm": 1.828125,
|
|
"learning_rate": 0.0004850877748666814,
|
|
"loss": 5.4448,
|
|
"mean_token_accuracy": 0.16701988875865936,
|
|
"num_tokens": 27211794.0,
|
|
"step": 14755
|
|
},
|
|
{
|
|
"entropy": 5.656436538696289,
|
|
"epoch": 1.240033606385213,
|
|
"grad_norm": 1.7109375,
|
|
"learning_rate": 0.00048507705228642117,
|
|
"loss": 5.4039,
|
|
"mean_token_accuracy": 0.16116943657398225,
|
|
"num_tokens": 27221852.0,
|
|
"step": 14760
|
|
},
|
|
{
|
|
"entropy": 5.649289464950561,
|
|
"epoch": 1.240453686200378,
|
|
"grad_norm": 1.5625,
|
|
"learning_rate": 0.0004850663259847542,
|
|
"loss": 5.4512,
|
|
"mean_token_accuracy": 0.16168087124824523,
|
|
"num_tokens": 27231558.0,
|
|
"step": 14765
|
|
},
|
|
{
|
|
"entropy": 5.6387444019317625,
|
|
"epoch": 1.240873766015543,
|
|
"grad_norm": 1.8515625,
|
|
"learning_rate": 0.00048505559596187037,
|
|
"loss": 5.4236,
|
|
"mean_token_accuracy": 0.16527110785245896,
|
|
"num_tokens": 27241053.0,
|
|
"step": 14770
|
|
},
|
|
{
|
|
"entropy": 5.636253309249878,
|
|
"epoch": 1.241293845830708,
|
|
"grad_norm": 3.015625,
|
|
"learning_rate": 0.0004850448622179599,
|
|
"loss": 5.3175,
|
|
"mean_token_accuracy": 0.16779372990131378,
|
|
"num_tokens": 27249770.0,
|
|
"step": 14775
|
|
},
|
|
{
|
|
"entropy": 5.738742828369141,
|
|
"epoch": 1.2417139256458727,
|
|
"grad_norm": 1.8203125,
|
|
"learning_rate": 0.0004850341247532128,
|
|
"loss": 5.562,
|
|
"mean_token_accuracy": 0.1625165030360222,
|
|
"num_tokens": 27258883.0,
|
|
"step": 14780
|
|
},
|
|
{
|
|
"entropy": 5.749982023239136,
|
|
"epoch": 1.2421340054610377,
|
|
"grad_norm": 1.890625,
|
|
"learning_rate": 0.0004850233835678194,
|
|
"loss": 5.4633,
|
|
"mean_token_accuracy": 0.1648506224155426,
|
|
"num_tokens": 27268056.0,
|
|
"step": 14785
|
|
},
|
|
{
|
|
"entropy": 5.663808441162109,
|
|
"epoch": 1.2425540852762025,
|
|
"grad_norm": 1.6953125,
|
|
"learning_rate": 0.0004850126386619699,
|
|
"loss": 5.3236,
|
|
"mean_token_accuracy": 0.17531862407922744,
|
|
"num_tokens": 27276965.0,
|
|
"step": 14790
|
|
},
|
|
{
|
|
"entropy": 5.587924385070801,
|
|
"epoch": 1.2429741650913673,
|
|
"grad_norm": 1.609375,
|
|
"learning_rate": 0.0004850018900358545,
|
|
"loss": 5.4055,
|
|
"mean_token_accuracy": 0.17150697410106658,
|
|
"num_tokens": 27286173.0,
|
|
"step": 14795
|
|
},
|
|
{
|
|
"entropy": 5.602390480041504,
|
|
"epoch": 1.2433942449065323,
|
|
"grad_norm": 2.203125,
|
|
"learning_rate": 0.00048499113768966386,
|
|
"loss": 5.4064,
|
|
"mean_token_accuracy": 0.16569943949580193,
|
|
"num_tokens": 27294863.0,
|
|
"step": 14800
|
|
},
|
|
{
|
|
"entropy": 5.70964732170105,
|
|
"epoch": 1.243814324721697,
|
|
"grad_norm": 1.921875,
|
|
"learning_rate": 0.0004849803816235884,
|
|
"loss": 5.4411,
|
|
"mean_token_accuracy": 0.16401146501302719,
|
|
"num_tokens": 27304427.0,
|
|
"step": 14805
|
|
},
|
|
{
|
|
"entropy": 5.745283937454223,
|
|
"epoch": 1.244234404536862,
|
|
"grad_norm": 1.78125,
|
|
"learning_rate": 0.0004849696218378185,
|
|
"loss": 5.4928,
|
|
"mean_token_accuracy": 0.16396332681179046,
|
|
"num_tokens": 27313716.0,
|
|
"step": 14810
|
|
},
|
|
{
|
|
"entropy": 5.715461683273316,
|
|
"epoch": 1.2446544843520269,
|
|
"grad_norm": 1.6328125,
|
|
"learning_rate": 0.0004849588583325449,
|
|
"loss": 5.3965,
|
|
"mean_token_accuracy": 0.17563249170780182,
|
|
"num_tokens": 27322342.0,
|
|
"step": 14815
|
|
},
|
|
{
|
|
"entropy": 5.7087640285491945,
|
|
"epoch": 1.2450745641671919,
|
|
"grad_norm": 1.6796875,
|
|
"learning_rate": 0.0004849480911079583,
|
|
"loss": 5.4743,
|
|
"mean_token_accuracy": 0.15280741229653358,
|
|
"num_tokens": 27331892.0,
|
|
"step": 14820
|
|
},
|
|
{
|
|
"entropy": 5.690652322769165,
|
|
"epoch": 1.2454946439823567,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004849373201642493,
|
|
"loss": 5.4503,
|
|
"mean_token_accuracy": 0.15473639070987702,
|
|
"num_tokens": 27340428.0,
|
|
"step": 14825
|
|
},
|
|
{
|
|
"entropy": 5.666205930709839,
|
|
"epoch": 1.2459147237975214,
|
|
"grad_norm": 3.359375,
|
|
"learning_rate": 0.0004849265455016088,
|
|
"loss": 5.4488,
|
|
"mean_token_accuracy": 0.16578658074140548,
|
|
"num_tokens": 27349224.0,
|
|
"step": 14830
|
|
},
|
|
{
|
|
"entropy": 5.656325817108154,
|
|
"epoch": 1.2463348036126864,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.0004849157671202277,
|
|
"loss": 5.4257,
|
|
"mean_token_accuracy": 0.16552276760339737,
|
|
"num_tokens": 27357480.0,
|
|
"step": 14835
|
|
},
|
|
{
|
|
"entropy": 5.63496470451355,
|
|
"epoch": 1.2467548834278512,
|
|
"grad_norm": 1.7578125,
|
|
"learning_rate": 0.0004849049850202968,
|
|
"loss": 5.3417,
|
|
"mean_token_accuracy": 0.17225963473320008,
|
|
"num_tokens": 27366732.0,
|
|
"step": 14840
|
|
},
|
|
{
|
|
"entropy": 5.706405782699585,
|
|
"epoch": 1.2471749632430162,
|
|
"grad_norm": 2.375,
|
|
"learning_rate": 0.0004848941992020072,
|
|
"loss": 5.4366,
|
|
"mean_token_accuracy": 0.16214968562126159,
|
|
"num_tokens": 27375834.0,
|
|
"step": 14845
|
|
},
|
|
{
|
|
"entropy": 5.714145278930664,
|
|
"epoch": 1.247595043058181,
|
|
"grad_norm": 1.4921875,
|
|
"learning_rate": 0.0004848834096655499,
|
|
"loss": 5.4407,
|
|
"mean_token_accuracy": 0.16688166558742523,
|
|
"num_tokens": 27385311.0,
|
|
"step": 14850
|
|
},
|
|
{
|
|
"entropy": 5.674659585952758,
|
|
"epoch": 1.2480151228733458,
|
|
"grad_norm": 2.125,
|
|
"learning_rate": 0.00048487261641111607,
|
|
"loss": 5.4804,
|
|
"mean_token_accuracy": 0.1633618250489235,
|
|
"num_tokens": 27394587.0,
|
|
"step": 14855
|
|
},
|
|
{
|
|
"entropy": 5.5959290027618405,
|
|
"epoch": 1.2484352026885108,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.000484861819438897,
|
|
"loss": 5.3548,
|
|
"mean_token_accuracy": 0.1634133994579315,
|
|
"num_tokens": 27403316.0,
|
|
"step": 14860
|
|
},
|
|
{
|
|
"entropy": 5.690115690231323,
|
|
"epoch": 1.2488552825036756,
|
|
"grad_norm": 2.015625,
|
|
"learning_rate": 0.0004848510187490838,
|
|
"loss": 5.4139,
|
|
"mean_token_accuracy": 0.16767150908708572,
|
|
"num_tokens": 27412709.0,
|
|
"step": 14865
|
|
},
|
|
{
|
|
"entropy": 5.697337675094604,
|
|
"epoch": 1.2492753623188406,
|
|
"grad_norm": 2.875,
|
|
"learning_rate": 0.0004848402143418679,
|
|
"loss": 5.4513,
|
|
"mean_token_accuracy": 0.16266510039567947,
|
|
"num_tokens": 27422004.0,
|
|
"step": 14870
|
|
},
|
|
{
|
|
"entropy": 5.71270170211792,
|
|
"epoch": 1.2496954421340054,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.00048482940621744053,
|
|
"loss": 5.492,
|
|
"mean_token_accuracy": 0.16200196892023086,
|
|
"num_tokens": 27431931.0,
|
|
"step": 14875
|
|
},
|
|
{
|
|
"entropy": 5.625197458267212,
|
|
"epoch": 1.2501155219491704,
|
|
"grad_norm": 1.5546875,
|
|
"learning_rate": 0.0004848185943759934,
|
|
"loss": 5.3174,
|
|
"mean_token_accuracy": 0.17668476104736328,
|
|
"num_tokens": 27441527.0,
|
|
"step": 14880
|
|
},
|
|
{
|
|
"entropy": 5.715439796447754,
|
|
"epoch": 1.2505356017643352,
|
|
"grad_norm": 1.75,
|
|
"learning_rate": 0.00048480777881771786,
|
|
"loss": 5.4617,
|
|
"mean_token_accuracy": 0.16427915468811988,
|
|
"num_tokens": 27449964.0,
|
|
"step": 14885
|
|
},
|
|
{
|
|
"entropy": 5.682178783416748,
|
|
"epoch": 1.2509556815795002,
|
|
"grad_norm": 1.9296875,
|
|
"learning_rate": 0.0004847969595428056,
|
|
"loss": 5.4647,
|
|
"mean_token_accuracy": 0.16355433017015458,
|
|
"num_tokens": 27459044.0,
|
|
"step": 14890
|
|
},
|
|
{
|
|
"entropy": 5.620614910125733,
|
|
"epoch": 1.251375761394665,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00048478613655144817,
|
|
"loss": 5.4447,
|
|
"mean_token_accuracy": 0.16300222128629685,
|
|
"num_tokens": 27467644.0,
|
|
"step": 14895
|
|
},
|
|
{
|
|
"entropy": 5.696579885482788,
|
|
"epoch": 1.2517958412098298,
|
|
"grad_norm": 2.0625,
|
|
"learning_rate": 0.0004847753098438374,
|
|
"loss": 5.4725,
|
|
"mean_token_accuracy": 0.1569360613822937,
|
|
"num_tokens": 27476899.0,
|
|
"step": 14900
|
|
},
|
|
{
|
|
"entropy": 5.6774050235748295,
|
|
"epoch": 1.2522159210249948,
|
|
"grad_norm": 1.515625,
|
|
"learning_rate": 0.000484764479420165,
|
|
"loss": 5.365,
|
|
"mean_token_accuracy": 0.17198057770729064,
|
|
"num_tokens": 27485167.0,
|
|
"step": 14905
|
|
},
|
|
{
|
|
"entropy": 5.703144598007202,
|
|
"epoch": 1.2526360008401596,
|
|
"grad_norm": 1.5390625,
|
|
"learning_rate": 0.00048475364528062287,
|
|
"loss": 5.4154,
|
|
"mean_token_accuracy": 0.16219851225614548,
|
|
"num_tokens": 27493986.0,
|
|
"step": 14910
|
|
},
|
|
{
|
|
"entropy": 5.72310881614685,
|
|
"epoch": 1.2530560806553246,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004847428074254029,
|
|
"loss": 5.4616,
|
|
"mean_token_accuracy": 0.16988417655229568,
|
|
"num_tokens": 27503896.0,
|
|
"step": 14915
|
|
},
|
|
{
|
|
"entropy": 5.662931871414185,
|
|
"epoch": 1.2534761604704894,
|
|
"grad_norm": 1.8359375,
|
|
"learning_rate": 0.00048473196585469713,
|
|
"loss": 5.4167,
|
|
"mean_token_accuracy": 0.16819938123226166,
|
|
"num_tokens": 27513485.0,
|
|
"step": 14920
|
|
},
|
|
{
|
|
"entropy": 5.721698808670044,
|
|
"epoch": 1.2538962402856542,
|
|
"grad_norm": 1.8125,
|
|
"learning_rate": 0.00048472112056869763,
|
|
"loss": 5.4664,
|
|
"mean_token_accuracy": 0.1657877966761589,
|
|
"num_tokens": 27523164.0,
|
|
"step": 14925
|
|
},
|
|
{
|
|
"entropy": 5.728442049026489,
|
|
"epoch": 1.2543163201008192,
|
|
"grad_norm": 1.53125,
|
|
"learning_rate": 0.0004847102715675964,
|
|
"loss": 5.4042,
|
|
"mean_token_accuracy": 0.16405912339687348,
|
|
"num_tokens": 27531387.0,
|
|
"step": 14930
|
|
},
|
|
{
|
|
"entropy": 5.639872455596924,
|
|
"epoch": 1.254736399915984,
|
|
"grad_norm": 1.4609375,
|
|
"learning_rate": 0.0004846994188515857,
|
|
"loss": 5.4354,
|
|
"mean_token_accuracy": 0.168145589530468,
|
|
"num_tokens": 27541754.0,
|
|
"step": 14935
|
|
},
|
|
{
|
|
"entropy": 5.789422988891602,
|
|
"epoch": 1.255156479731149,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004846885624208578,
|
|
"loss": 5.5008,
|
|
"mean_token_accuracy": 0.1617642045021057,
|
|
"num_tokens": 27551458.0,
|
|
"step": 14940
|
|
},
|
|
{
|
|
"entropy": 5.697876119613648,
|
|
"epoch": 1.2555765595463138,
|
|
"grad_norm": 1.5,
|
|
"learning_rate": 0.000484677702275605,
|
|
"loss": 5.4113,
|
|
"mean_token_accuracy": 0.170192214846611,
|
|
"num_tokens": 27560797.0,
|
|
"step": 14945
|
|
},
|
|
{
|
|
"entropy": 5.651928424835205,
|
|
"epoch": 1.2559966393614788,
|
|
"grad_norm": 1.7890625,
|
|
"learning_rate": 0.00048466683841601963,
|
|
"loss": 5.3958,
|
|
"mean_token_accuracy": 0.16396012008190156,
|
|
"num_tokens": 27570166.0,
|
|
"step": 14950
|
|
},
|
|
{
|
|
"entropy": 5.603499841690064,
|
|
"epoch": 1.2564167191766435,
|
|
"grad_norm": 1.546875,
|
|
"learning_rate": 0.00048465597084229416,
|
|
"loss": 5.308,
|
|
"mean_token_accuracy": 0.17143625617027283,
|
|
"num_tokens": 27579411.0,
|
|
"step": 14955
|
|
},
|
|
{
|
|
"entropy": 5.695507955551148,
|
|
"epoch": 1.2568367989918086,
|
|
"grad_norm": 1.484375,
|
|
"learning_rate": 0.0004846450995546212,
|
|
"loss": 5.5591,
|
|
"mean_token_accuracy": 0.15752519071102142,
|
|
"num_tokens": 27589124.0,
|
|
"step": 14960
|
|
},
|
|
{
|
|
"entropy": 5.721073627471924,
|
|
"epoch": 1.2572568788069733,
|
|
"grad_norm": 1.9375,
|
|
"learning_rate": 0.0004846342245531932,
|
|
"loss": 5.5081,
|
|
"mean_token_accuracy": 0.15465038716793061,
|
|
"num_tokens": 27598664.0,
|
|
"step": 14965
|
|
},
|
|
{
|
|
"entropy": 5.7139750003814695,
|
|
"epoch": 1.2576769586221381,
|
|
"grad_norm": 1.671875,
|
|
"learning_rate": 0.0004846233458382029,
|
|
"loss": 5.4358,
|
|
"mean_token_accuracy": 0.16635446548461913,
|
|
"num_tokens": 27607189.0,
|
|
"step": 14970
|
|
},
|
|
{
|
|
"entropy": 5.7070770263671875,
|
|
"epoch": 1.2580970384373031,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048461246340984293,
|
|
"loss": 5.4657,
|
|
"mean_token_accuracy": 0.16415384262800217,
|
|
"num_tokens": 27616415.0,
|
|
"step": 14975
|
|
},
|
|
{
|
|
"entropy": 5.698441743850708,
|
|
"epoch": 1.258517118252468,
|
|
"grad_norm": 1.6171875,
|
|
"learning_rate": 0.0004846015772683061,
|
|
"loss": 5.4595,
|
|
"mean_token_accuracy": 0.16718726754188537,
|
|
"num_tokens": 27624492.0,
|
|
"step": 14980
|
|
},
|
|
{
|
|
"entropy": 5.598542785644531,
|
|
"epoch": 1.258937198067633,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.00048459068741378526,
|
|
"loss": 5.3518,
|
|
"mean_token_accuracy": 0.1675553500652313,
|
|
"num_tokens": 27634243.0,
|
|
"step": 14985
|
|
},
|
|
{
|
|
"entropy": 5.647716331481933,
|
|
"epoch": 1.2593572778827977,
|
|
"grad_norm": 1.5234375,
|
|
"learning_rate": 0.0004845797938464734,
|
|
"loss": 5.4522,
|
|
"mean_token_accuracy": 0.16648911386728288,
|
|
"num_tokens": 27642887.0,
|
|
"step": 14990
|
|
},
|
|
{
|
|
"entropy": 5.7321514129638675,
|
|
"epoch": 1.2597773576979625,
|
|
"grad_norm": 1.6484375,
|
|
"learning_rate": 0.0004845688965665633,
|
|
"loss": 5.4703,
|
|
"mean_token_accuracy": 0.15985920876264573,
|
|
"num_tokens": 27652524.0,
|
|
"step": 14995
|
|
},
|
|
{
|
|
"entropy": 5.682502317428589,
|
|
"epoch": 1.2601974375131275,
|
|
"grad_norm": 1.625,
|
|
"learning_rate": 0.00048455799557424814,
|
|
"loss": 5.3173,
|
|
"mean_token_accuracy": 0.17918399572372437,
|
|
"num_tokens": 27661306.0,
|
|
"step": 15000
|
|
},
|
|
{
|
|
"epoch": 1.2601974375131275,
|
|
"eval_entropy": 5.539990809356291,
|
|
"eval_loss": 5.50443172454834,
|
|
"eval_mean_token_accuracy": 0.17097199404162772,
|
|
"eval_num_tokens": 27661306.0,
|
|
"eval_runtime": 27.3249,
|
|
"eval_samples_per_second": 1367.473,
|
|
"eval_steps_per_second": 170.943,
|
|
"step": 15000
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 119020,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 10,
|
|
"save_steps": 3000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.0484285319168e+16,
|
|
"train_batch_size": 16,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|