Files
Llama-3.1-8B/trainer_state.json
ModelHub XC 7000915984 初始化项目,由ModelHub XC社区提供模型
Model: yapeichang/Llama-3.1-8B
Source: Original Platform
2026-06-04 08:18:17 +08:00

14093 lines
346 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.99968,
"eval_steps": 500,
"global_step": 1562,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00064,
"grad_norm": 48.05089569091797,
"learning_rate": 0.0,
"loss": 1.4121,
"mean_token_accuracy": 0.6530859991908073,
"num_tokens": 13040.0,
"step": 1
},
{
"epoch": 0.00128,
"grad_norm": 26.102807998657227,
"learning_rate": 6.329113924050633e-08,
"loss": 1.5231,
"mean_token_accuracy": 0.6276459023356438,
"num_tokens": 26989.0,
"step": 2
},
{
"epoch": 0.00192,
"grad_norm": 95.80143737792969,
"learning_rate": 1.2658227848101266e-07,
"loss": 1.4079,
"mean_token_accuracy": 0.6467841640114784,
"num_tokens": 37679.0,
"step": 3
},
{
"epoch": 0.00256,
"grad_norm": 15.393555641174316,
"learning_rate": 1.89873417721519e-07,
"loss": 1.5839,
"mean_token_accuracy": 0.6144984066486359,
"num_tokens": 52503.0,
"step": 4
},
{
"epoch": 0.0032,
"grad_norm": 76.43201446533203,
"learning_rate": 2.5316455696202533e-07,
"loss": 1.5376,
"mean_token_accuracy": 0.614102203398943,
"num_tokens": 62165.0,
"step": 5
},
{
"epoch": 0.00384,
"grad_norm": 13.124272346496582,
"learning_rate": 3.164556962025317e-07,
"loss": 1.4651,
"mean_token_accuracy": 0.6233282685279846,
"num_tokens": 76003.0,
"step": 6
},
{
"epoch": 0.00448,
"grad_norm": 22.9676570892334,
"learning_rate": 3.79746835443038e-07,
"loss": 1.5145,
"mean_token_accuracy": 0.6122585535049438,
"num_tokens": 92265.0,
"step": 7
},
{
"epoch": 0.00512,
"grad_norm": 15.335000038146973,
"learning_rate": 4.4303797468354435e-07,
"loss": 1.4871,
"mean_token_accuracy": 0.6254958733916283,
"num_tokens": 106642.0,
"step": 8
},
{
"epoch": 0.00576,
"grad_norm": 13.938623428344727,
"learning_rate": 5.063291139240507e-07,
"loss": 1.2911,
"mean_token_accuracy": 0.6838521659374237,
"num_tokens": 118655.0,
"step": 9
},
{
"epoch": 0.0064,
"grad_norm": 12.838367462158203,
"learning_rate": 5.69620253164557e-07,
"loss": 1.5914,
"mean_token_accuracy": 0.6144495904445648,
"num_tokens": 133015.0,
"step": 10
},
{
"epoch": 0.00704,
"grad_norm": 28.111896514892578,
"learning_rate": 6.329113924050634e-07,
"loss": 1.5112,
"mean_token_accuracy": 0.6265368536114693,
"num_tokens": 147398.0,
"step": 11
},
{
"epoch": 0.00768,
"grad_norm": 86.63944244384766,
"learning_rate": 6.962025316455696e-07,
"loss": 1.3589,
"mean_token_accuracy": 0.6383277997374535,
"num_tokens": 158812.0,
"step": 12
},
{
"epoch": 0.00832,
"grad_norm": 168.6136016845703,
"learning_rate": 7.59493670886076e-07,
"loss": 1.7851,
"mean_token_accuracy": 0.5883053466677666,
"num_tokens": 170703.0,
"step": 13
},
{
"epoch": 0.00896,
"grad_norm": 217.70408630371094,
"learning_rate": 8.227848101265823e-07,
"loss": 1.4027,
"mean_token_accuracy": 0.6471100524067879,
"num_tokens": 184556.0,
"step": 14
},
{
"epoch": 0.0096,
"grad_norm": 90.14910125732422,
"learning_rate": 8.860759493670887e-07,
"loss": 1.376,
"mean_token_accuracy": 0.6501478627324104,
"num_tokens": 202523.0,
"step": 15
},
{
"epoch": 0.01024,
"grad_norm": 97.01738739013672,
"learning_rate": 9.493670886075951e-07,
"loss": 1.1443,
"mean_token_accuracy": 0.6894906312227249,
"num_tokens": 212189.0,
"step": 16
},
{
"epoch": 0.01088,
"grad_norm": 64.89408874511719,
"learning_rate": 1.0126582278481013e-06,
"loss": 1.5423,
"mean_token_accuracy": 0.6109918430447578,
"num_tokens": 226375.0,
"step": 17
},
{
"epoch": 0.01152,
"grad_norm": 82.11033630371094,
"learning_rate": 1.0759493670886077e-06,
"loss": 1.6578,
"mean_token_accuracy": 0.6166552416980267,
"num_tokens": 238636.0,
"step": 18
},
{
"epoch": 0.01216,
"grad_norm": 102.00513458251953,
"learning_rate": 1.139240506329114e-06,
"loss": 1.3764,
"mean_token_accuracy": 0.6483487188816071,
"num_tokens": 250678.0,
"step": 19
},
{
"epoch": 0.0128,
"grad_norm": 69.1679916381836,
"learning_rate": 1.2025316455696204e-06,
"loss": 1.5599,
"mean_token_accuracy": 0.6092639714479446,
"num_tokens": 266795.0,
"step": 20
},
{
"epoch": 0.01344,
"grad_norm": 138.91578674316406,
"learning_rate": 1.2658227848101267e-06,
"loss": 1.4293,
"mean_token_accuracy": 0.6301368102431297,
"num_tokens": 280316.0,
"step": 21
},
{
"epoch": 0.01408,
"grad_norm": 84.13861083984375,
"learning_rate": 1.3291139240506329e-06,
"loss": 1.5571,
"mean_token_accuracy": 0.6376139968633652,
"num_tokens": 292273.0,
"step": 22
},
{
"epoch": 0.01472,
"grad_norm": 75.20580291748047,
"learning_rate": 1.3924050632911392e-06,
"loss": 1.7514,
"mean_token_accuracy": 0.5854216478765011,
"num_tokens": 304665.0,
"step": 23
},
{
"epoch": 0.01536,
"grad_norm": 33.301204681396484,
"learning_rate": 1.4556962025316456e-06,
"loss": 1.445,
"mean_token_accuracy": 0.6291614323854446,
"num_tokens": 317582.0,
"step": 24
},
{
"epoch": 0.016,
"grad_norm": 49.913997650146484,
"learning_rate": 1.518987341772152e-06,
"loss": 1.1659,
"mean_token_accuracy": 0.6733755245804787,
"num_tokens": 330930.0,
"step": 25
},
{
"epoch": 0.01664,
"grad_norm": 16.423551559448242,
"learning_rate": 1.5822784810126585e-06,
"loss": 1.4189,
"mean_token_accuracy": 0.6274379268288612,
"num_tokens": 343897.0,
"step": 26
},
{
"epoch": 0.01728,
"grad_norm": 48.93357467651367,
"learning_rate": 1.6455696202531647e-06,
"loss": 1.5449,
"mean_token_accuracy": 0.6013954728841782,
"num_tokens": 358229.0,
"step": 27
},
{
"epoch": 0.01792,
"grad_norm": 41.11186981201172,
"learning_rate": 1.708860759493671e-06,
"loss": 1.4404,
"mean_token_accuracy": 0.6342460885643959,
"num_tokens": 369284.0,
"step": 28
},
{
"epoch": 0.01856,
"grad_norm": 32.76636505126953,
"learning_rate": 1.7721518987341774e-06,
"loss": 1.5705,
"mean_token_accuracy": 0.6268726661801338,
"num_tokens": 384919.0,
"step": 29
},
{
"epoch": 0.0192,
"grad_norm": 65.5134048461914,
"learning_rate": 1.8354430379746838e-06,
"loss": 1.4156,
"mean_token_accuracy": 0.6435664221644402,
"num_tokens": 399789.0,
"step": 30
},
{
"epoch": 0.01984,
"grad_norm": 6.874727725982666,
"learning_rate": 1.8987341772151901e-06,
"loss": 1.3794,
"mean_token_accuracy": 0.6731243506073952,
"num_tokens": 410300.0,
"step": 31
},
{
"epoch": 0.02048,
"grad_norm": 30.61371421813965,
"learning_rate": 1.9620253164556965e-06,
"loss": 1.4516,
"mean_token_accuracy": 0.6296539008617401,
"num_tokens": 425086.0,
"step": 32
},
{
"epoch": 0.02112,
"grad_norm": 53.762176513671875,
"learning_rate": 2.0253164556962026e-06,
"loss": 1.3964,
"mean_token_accuracy": 0.6357481107115746,
"num_tokens": 438978.0,
"step": 33
},
{
"epoch": 0.02176,
"grad_norm": 58.90037536621094,
"learning_rate": 2.088607594936709e-06,
"loss": 1.4799,
"mean_token_accuracy": 0.619662769138813,
"num_tokens": 451824.0,
"step": 34
},
{
"epoch": 0.0224,
"grad_norm": 8.633367538452148,
"learning_rate": 2.1518987341772153e-06,
"loss": 1.4459,
"mean_token_accuracy": 0.6490786001086235,
"num_tokens": 465569.0,
"step": 35
},
{
"epoch": 0.02304,
"grad_norm": 152.17538452148438,
"learning_rate": 2.2151898734177215e-06,
"loss": 1.5855,
"mean_token_accuracy": 0.5879618301987648,
"num_tokens": 476187.0,
"step": 36
},
{
"epoch": 0.02368,
"grad_norm": 63.17410659790039,
"learning_rate": 2.278481012658228e-06,
"loss": 1.4941,
"mean_token_accuracy": 0.6467924416065216,
"num_tokens": 490658.0,
"step": 37
},
{
"epoch": 0.02432,
"grad_norm": 40.93563461303711,
"learning_rate": 2.341772151898734e-06,
"loss": 1.2798,
"mean_token_accuracy": 0.6463187485933304,
"num_tokens": 500707.0,
"step": 38
},
{
"epoch": 0.02496,
"grad_norm": 47.74807357788086,
"learning_rate": 2.4050632911392408e-06,
"loss": 1.682,
"mean_token_accuracy": 0.5752647258341312,
"num_tokens": 514924.0,
"step": 39
},
{
"epoch": 0.0256,
"grad_norm": 8.83420181274414,
"learning_rate": 2.4683544303797473e-06,
"loss": 1.374,
"mean_token_accuracy": 0.6248214021325111,
"num_tokens": 529540.0,
"step": 40
},
{
"epoch": 0.02624,
"grad_norm": 73.02564239501953,
"learning_rate": 2.5316455696202535e-06,
"loss": 1.4803,
"mean_token_accuracy": 0.63168865442276,
"num_tokens": 542779.0,
"step": 41
},
{
"epoch": 0.02688,
"grad_norm": 58.30765914916992,
"learning_rate": 2.5949367088607596e-06,
"loss": 1.2061,
"mean_token_accuracy": 0.6671500578522682,
"num_tokens": 556396.0,
"step": 42
},
{
"epoch": 0.02752,
"grad_norm": 33.67079162597656,
"learning_rate": 2.6582278481012658e-06,
"loss": 1.522,
"mean_token_accuracy": 0.5986876226961613,
"num_tokens": 569089.0,
"step": 43
},
{
"epoch": 0.02816,
"grad_norm": 31.859474182128906,
"learning_rate": 2.7215189873417724e-06,
"loss": 1.3638,
"mean_token_accuracy": 0.6643245741724968,
"num_tokens": 583380.0,
"step": 44
},
{
"epoch": 0.0288,
"grad_norm": 33.59089660644531,
"learning_rate": 2.7848101265822785e-06,
"loss": 1.4956,
"mean_token_accuracy": 0.6372537463903427,
"num_tokens": 596014.0,
"step": 45
},
{
"epoch": 0.02944,
"grad_norm": 25.843647003173828,
"learning_rate": 2.848101265822785e-06,
"loss": 1.1846,
"mean_token_accuracy": 0.6587028503417969,
"num_tokens": 608555.0,
"step": 46
},
{
"epoch": 0.03008,
"grad_norm": 5.101419925689697,
"learning_rate": 2.9113924050632912e-06,
"loss": 1.3947,
"mean_token_accuracy": 0.6413091421127319,
"num_tokens": 621134.0,
"step": 47
},
{
"epoch": 0.03072,
"grad_norm": 40.20245361328125,
"learning_rate": 2.9746835443037974e-06,
"loss": 1.345,
"mean_token_accuracy": 0.6560010835528374,
"num_tokens": 635852.0,
"step": 48
},
{
"epoch": 0.03136,
"grad_norm": 94.18912506103516,
"learning_rate": 3.037974683544304e-06,
"loss": 1.3185,
"mean_token_accuracy": 0.6504691988229752,
"num_tokens": 648238.0,
"step": 49
},
{
"epoch": 0.032,
"grad_norm": 23.04238510131836,
"learning_rate": 3.10126582278481e-06,
"loss": 1.5373,
"mean_token_accuracy": 0.6217592805624008,
"num_tokens": 662824.0,
"step": 50
},
{
"epoch": 0.03264,
"grad_norm": 56.0700569152832,
"learning_rate": 3.164556962025317e-06,
"loss": 1.2922,
"mean_token_accuracy": 0.6528200656175613,
"num_tokens": 675378.0,
"step": 51
},
{
"epoch": 0.03328,
"grad_norm": 14.78956127166748,
"learning_rate": 3.2278481012658232e-06,
"loss": 1.5514,
"mean_token_accuracy": 0.6194410622119904,
"num_tokens": 689687.0,
"step": 52
},
{
"epoch": 0.03392,
"grad_norm": 52.21746063232422,
"learning_rate": 3.2911392405063294e-06,
"loss": 1.5048,
"mean_token_accuracy": 0.6074254661798477,
"num_tokens": 702196.0,
"step": 53
},
{
"epoch": 0.03456,
"grad_norm": 39.832069396972656,
"learning_rate": 3.354430379746836e-06,
"loss": 1.2862,
"mean_token_accuracy": 0.6434847190976143,
"num_tokens": 716114.0,
"step": 54
},
{
"epoch": 0.0352,
"grad_norm": 13.459836959838867,
"learning_rate": 3.417721518987342e-06,
"loss": 1.6001,
"mean_token_accuracy": 0.6024078205227852,
"num_tokens": 729970.0,
"step": 55
},
{
"epoch": 0.03584,
"grad_norm": 7.002089023590088,
"learning_rate": 3.4810126582278487e-06,
"loss": 1.2902,
"mean_token_accuracy": 0.6684707179665565,
"num_tokens": 742129.0,
"step": 56
},
{
"epoch": 0.03648,
"grad_norm": 5.5110907554626465,
"learning_rate": 3.544303797468355e-06,
"loss": 1.287,
"mean_token_accuracy": 0.6572533845901489,
"num_tokens": 756207.0,
"step": 57
},
{
"epoch": 0.03712,
"grad_norm": 19.13697052001953,
"learning_rate": 3.607594936708861e-06,
"loss": 1.3056,
"mean_token_accuracy": 0.6451508551836014,
"num_tokens": 768845.0,
"step": 58
},
{
"epoch": 0.03776,
"grad_norm": 15.833284378051758,
"learning_rate": 3.6708860759493675e-06,
"loss": 1.738,
"mean_token_accuracy": 0.5944546982645988,
"num_tokens": 780788.0,
"step": 59
},
{
"epoch": 0.0384,
"grad_norm": 20.870206832885742,
"learning_rate": 3.7341772151898737e-06,
"loss": 1.4071,
"mean_token_accuracy": 0.6308668106794357,
"num_tokens": 793426.0,
"step": 60
},
{
"epoch": 0.03904,
"grad_norm": 16.34093475341797,
"learning_rate": 3.7974683544303802e-06,
"loss": 1.5468,
"mean_token_accuracy": 0.627624161541462,
"num_tokens": 804663.0,
"step": 61
},
{
"epoch": 0.03968,
"grad_norm": 17.309402465820312,
"learning_rate": 3.860759493670886e-06,
"loss": 1.505,
"mean_token_accuracy": 0.6210604161024094,
"num_tokens": 818199.0,
"step": 62
},
{
"epoch": 0.04032,
"grad_norm": 4.618063449859619,
"learning_rate": 3.924050632911393e-06,
"loss": 1.2564,
"mean_token_accuracy": 0.664100281894207,
"num_tokens": 830056.0,
"step": 63
},
{
"epoch": 0.04096,
"grad_norm": 61.135398864746094,
"learning_rate": 3.9873417721518995e-06,
"loss": 1.2191,
"mean_token_accuracy": 0.6882363706827164,
"num_tokens": 844140.0,
"step": 64
},
{
"epoch": 0.0416,
"grad_norm": 58.18097686767578,
"learning_rate": 4.050632911392405e-06,
"loss": 1.4123,
"mean_token_accuracy": 0.6319537982344627,
"num_tokens": 856421.0,
"step": 65
},
{
"epoch": 0.04224,
"grad_norm": 4.918764591217041,
"learning_rate": 4.113924050632912e-06,
"loss": 1.342,
"mean_token_accuracy": 0.6437618285417557,
"num_tokens": 870219.0,
"step": 66
},
{
"epoch": 0.04288,
"grad_norm": 6.8447585105896,
"learning_rate": 4.177215189873418e-06,
"loss": 1.2137,
"mean_token_accuracy": 0.6827035769820213,
"num_tokens": 880524.0,
"step": 67
},
{
"epoch": 0.04352,
"grad_norm": 12.660514831542969,
"learning_rate": 4.240506329113924e-06,
"loss": 1.4528,
"mean_token_accuracy": 0.6331579238176346,
"num_tokens": 891958.0,
"step": 68
},
{
"epoch": 0.04416,
"grad_norm": 7.19080924987793,
"learning_rate": 4.303797468354431e-06,
"loss": 1.4252,
"mean_token_accuracy": 0.65432970225811,
"num_tokens": 904908.0,
"step": 69
},
{
"epoch": 0.0448,
"grad_norm": 3.9678971767425537,
"learning_rate": 4.367088607594937e-06,
"loss": 1.2971,
"mean_token_accuracy": 0.6341002583503723,
"num_tokens": 918681.0,
"step": 70
},
{
"epoch": 0.04544,
"grad_norm": 7.076503276824951,
"learning_rate": 4.430379746835443e-06,
"loss": 1.2615,
"mean_token_accuracy": 0.6656483858823776,
"num_tokens": 928006.0,
"step": 71
},
{
"epoch": 0.04608,
"grad_norm": 5.339262962341309,
"learning_rate": 4.4936708860759495e-06,
"loss": 1.344,
"mean_token_accuracy": 0.6466359868645668,
"num_tokens": 940426.0,
"step": 72
},
{
"epoch": 0.04672,
"grad_norm": 7.0051703453063965,
"learning_rate": 4.556962025316456e-06,
"loss": 1.4216,
"mean_token_accuracy": 0.630195863544941,
"num_tokens": 954324.0,
"step": 73
},
{
"epoch": 0.04736,
"grad_norm": 4.788408279418945,
"learning_rate": 4.620253164556963e-06,
"loss": 1.4698,
"mean_token_accuracy": 0.614908404648304,
"num_tokens": 967599.0,
"step": 74
},
{
"epoch": 0.048,
"grad_norm": 5.4968366622924805,
"learning_rate": 4.683544303797468e-06,
"loss": 1.3819,
"mean_token_accuracy": 0.6416184306144714,
"num_tokens": 981616.0,
"step": 75
},
{
"epoch": 0.04864,
"grad_norm": 4.646515369415283,
"learning_rate": 4.746835443037975e-06,
"loss": 1.4479,
"mean_token_accuracy": 0.6559240221977234,
"num_tokens": 993440.0,
"step": 76
},
{
"epoch": 0.04928,
"grad_norm": 5.726164817810059,
"learning_rate": 4.8101265822784815e-06,
"loss": 1.3377,
"mean_token_accuracy": 0.6394649744033813,
"num_tokens": 1005930.0,
"step": 77
},
{
"epoch": 0.04992,
"grad_norm": 7.1859588623046875,
"learning_rate": 4.873417721518987e-06,
"loss": 1.5267,
"mean_token_accuracy": 0.6308169737458229,
"num_tokens": 1018113.0,
"step": 78
},
{
"epoch": 0.05056,
"grad_norm": 5.919410705566406,
"learning_rate": 4.936708860759495e-06,
"loss": 1.5063,
"mean_token_accuracy": 0.6113156750798225,
"num_tokens": 1030828.0,
"step": 79
},
{
"epoch": 0.0512,
"grad_norm": 4.142421722412109,
"learning_rate": 5e-06,
"loss": 1.365,
"mean_token_accuracy": 0.6484999246895313,
"num_tokens": 1045699.0,
"step": 80
},
{
"epoch": 0.05184,
"grad_norm": 5.720826625823975,
"learning_rate": 5e-06,
"loss": 1.324,
"mean_token_accuracy": 0.6485482379794121,
"num_tokens": 1060409.0,
"step": 81
},
{
"epoch": 0.05248,
"grad_norm": 4.827797889709473,
"learning_rate": 5e-06,
"loss": 1.2039,
"mean_token_accuracy": 0.6779980957508087,
"num_tokens": 1072000.0,
"step": 82
},
{
"epoch": 0.05312,
"grad_norm": 4.712104797363281,
"learning_rate": 5e-06,
"loss": 1.5722,
"mean_token_accuracy": 0.6242717280983925,
"num_tokens": 1085195.0,
"step": 83
},
{
"epoch": 0.05376,
"grad_norm": 4.540091514587402,
"learning_rate": 5e-06,
"loss": 1.5155,
"mean_token_accuracy": 0.633654311299324,
"num_tokens": 1099571.0,
"step": 84
},
{
"epoch": 0.0544,
"grad_norm": 5.3648905754089355,
"learning_rate": 5e-06,
"loss": 1.4137,
"mean_token_accuracy": 0.6268021315336227,
"num_tokens": 1114063.0,
"step": 85
},
{
"epoch": 0.05504,
"grad_norm": 4.212844371795654,
"learning_rate": 5e-06,
"loss": 1.3986,
"mean_token_accuracy": 0.638413742184639,
"num_tokens": 1128456.0,
"step": 86
},
{
"epoch": 0.05568,
"grad_norm": 4.77896785736084,
"learning_rate": 5e-06,
"loss": 1.3913,
"mean_token_accuracy": 0.6454877629876137,
"num_tokens": 1141977.0,
"step": 87
},
{
"epoch": 0.05632,
"grad_norm": 6.540133953094482,
"learning_rate": 5e-06,
"loss": 1.1014,
"mean_token_accuracy": 0.6967510804533958,
"num_tokens": 1154057.0,
"step": 88
},
{
"epoch": 0.05696,
"grad_norm": 3.7844600677490234,
"learning_rate": 5e-06,
"loss": 1.4211,
"mean_token_accuracy": 0.6254619807004929,
"num_tokens": 1169061.0,
"step": 89
},
{
"epoch": 0.0576,
"grad_norm": 3.695892810821533,
"learning_rate": 5e-06,
"loss": 1.2861,
"mean_token_accuracy": 0.6654118373990059,
"num_tokens": 1182796.0,
"step": 90
},
{
"epoch": 0.05824,
"grad_norm": 4.524760723114014,
"learning_rate": 5e-06,
"loss": 1.5883,
"mean_token_accuracy": 0.6046858802437782,
"num_tokens": 1196449.0,
"step": 91
},
{
"epoch": 0.05888,
"grad_norm": 5.951873779296875,
"learning_rate": 5e-06,
"loss": 1.3191,
"mean_token_accuracy": 0.6647541224956512,
"num_tokens": 1209472.0,
"step": 92
},
{
"epoch": 0.05952,
"grad_norm": 5.607054233551025,
"learning_rate": 5e-06,
"loss": 1.5771,
"mean_token_accuracy": 0.6196302324533463,
"num_tokens": 1222720.0,
"step": 93
},
{
"epoch": 0.06016,
"grad_norm": 4.97398567199707,
"learning_rate": 5e-06,
"loss": 1.2972,
"mean_token_accuracy": 0.6596869081258774,
"num_tokens": 1236366.0,
"step": 94
},
{
"epoch": 0.0608,
"grad_norm": 5.066143035888672,
"learning_rate": 5e-06,
"loss": 1.6685,
"mean_token_accuracy": 0.6038380563259125,
"num_tokens": 1248615.0,
"step": 95
},
{
"epoch": 0.06144,
"grad_norm": 4.967097282409668,
"learning_rate": 5e-06,
"loss": 1.3559,
"mean_token_accuracy": 0.649577222764492,
"num_tokens": 1258892.0,
"step": 96
},
{
"epoch": 0.06208,
"grad_norm": 3.9898176193237305,
"learning_rate": 5e-06,
"loss": 1.1218,
"mean_token_accuracy": 0.6794590428471565,
"num_tokens": 1272167.0,
"step": 97
},
{
"epoch": 0.06272,
"grad_norm": 4.856038570404053,
"learning_rate": 5e-06,
"loss": 1.4458,
"mean_token_accuracy": 0.6120704114437103,
"num_tokens": 1284435.0,
"step": 98
},
{
"epoch": 0.06336,
"grad_norm": 4.787650108337402,
"learning_rate": 5e-06,
"loss": 1.1262,
"mean_token_accuracy": 0.7047765105962753,
"num_tokens": 1295683.0,
"step": 99
},
{
"epoch": 0.064,
"grad_norm": 4.880126953125,
"learning_rate": 5e-06,
"loss": 1.359,
"mean_token_accuracy": 0.6566307917237282,
"num_tokens": 1309253.0,
"step": 100
},
{
"epoch": 0.06464,
"grad_norm": 4.704743385314941,
"learning_rate": 5e-06,
"loss": 1.3073,
"mean_token_accuracy": 0.6889312416315079,
"num_tokens": 1321838.0,
"step": 101
},
{
"epoch": 0.06528,
"grad_norm": 4.521302700042725,
"learning_rate": 5e-06,
"loss": 1.199,
"mean_token_accuracy": 0.6745252087712288,
"num_tokens": 1333677.0,
"step": 102
},
{
"epoch": 0.06592,
"grad_norm": 4.4061689376831055,
"learning_rate": 5e-06,
"loss": 1.2863,
"mean_token_accuracy": 0.6601276621222496,
"num_tokens": 1345912.0,
"step": 103
},
{
"epoch": 0.06656,
"grad_norm": 4.12923002243042,
"learning_rate": 5e-06,
"loss": 1.3052,
"mean_token_accuracy": 0.6593477055430412,
"num_tokens": 1356535.0,
"step": 104
},
{
"epoch": 0.0672,
"grad_norm": 4.265780448913574,
"learning_rate": 5e-06,
"loss": 1.5341,
"mean_token_accuracy": 0.6348154991865158,
"num_tokens": 1368522.0,
"step": 105
},
{
"epoch": 0.06784,
"grad_norm": 4.388949394226074,
"learning_rate": 5e-06,
"loss": 1.3616,
"mean_token_accuracy": 0.6796349138021469,
"num_tokens": 1381247.0,
"step": 106
},
{
"epoch": 0.06848,
"grad_norm": 4.523592948913574,
"learning_rate": 5e-06,
"loss": 1.4017,
"mean_token_accuracy": 0.638551875948906,
"num_tokens": 1392378.0,
"step": 107
},
{
"epoch": 0.06912,
"grad_norm": 4.722465991973877,
"learning_rate": 5e-06,
"loss": 1.1751,
"mean_token_accuracy": 0.6694681495428085,
"num_tokens": 1404081.0,
"step": 108
},
{
"epoch": 0.06976,
"grad_norm": 3.7663962841033936,
"learning_rate": 5e-06,
"loss": 1.2044,
"mean_token_accuracy": 0.6716165691614151,
"num_tokens": 1417942.0,
"step": 109
},
{
"epoch": 0.0704,
"grad_norm": 3.8090057373046875,
"learning_rate": 5e-06,
"loss": 1.4822,
"mean_token_accuracy": 0.6303943246603012,
"num_tokens": 1429633.0,
"step": 110
},
{
"epoch": 0.07104,
"grad_norm": 4.707150936126709,
"learning_rate": 5e-06,
"loss": 1.4048,
"mean_token_accuracy": 0.661470353603363,
"num_tokens": 1439918.0,
"step": 111
},
{
"epoch": 0.07168,
"grad_norm": 4.384817600250244,
"learning_rate": 5e-06,
"loss": 1.3427,
"mean_token_accuracy": 0.6572804600000381,
"num_tokens": 1451652.0,
"step": 112
},
{
"epoch": 0.07232,
"grad_norm": 3.9072980880737305,
"learning_rate": 5e-06,
"loss": 1.3317,
"mean_token_accuracy": 0.6496234610676765,
"num_tokens": 1466346.0,
"step": 113
},
{
"epoch": 0.07296,
"grad_norm": 3.8167197704315186,
"learning_rate": 5e-06,
"loss": 1.5108,
"mean_token_accuracy": 0.619833417236805,
"num_tokens": 1482241.0,
"step": 114
},
{
"epoch": 0.0736,
"grad_norm": 3.857537031173706,
"learning_rate": 5e-06,
"loss": 1.3917,
"mean_token_accuracy": 0.6645993143320084,
"num_tokens": 1495196.0,
"step": 115
},
{
"epoch": 0.07424,
"grad_norm": 4.024837970733643,
"learning_rate": 5e-06,
"loss": 1.3252,
"mean_token_accuracy": 0.6474068984389305,
"num_tokens": 1508711.0,
"step": 116
},
{
"epoch": 0.07488,
"grad_norm": 3.6451432704925537,
"learning_rate": 5e-06,
"loss": 1.3883,
"mean_token_accuracy": 0.658332034945488,
"num_tokens": 1521622.0,
"step": 117
},
{
"epoch": 0.07552,
"grad_norm": 3.7489166259765625,
"learning_rate": 5e-06,
"loss": 1.4337,
"mean_token_accuracy": 0.6373646706342697,
"num_tokens": 1536109.0,
"step": 118
},
{
"epoch": 0.07616,
"grad_norm": 4.419317245483398,
"learning_rate": 5e-06,
"loss": 1.6063,
"mean_token_accuracy": 0.5973443016409874,
"num_tokens": 1548521.0,
"step": 119
},
{
"epoch": 0.0768,
"grad_norm": 3.8151636123657227,
"learning_rate": 5e-06,
"loss": 1.2434,
"mean_token_accuracy": 0.6510246470570564,
"num_tokens": 1562145.0,
"step": 120
},
{
"epoch": 0.07744,
"grad_norm": 4.26577091217041,
"learning_rate": 5e-06,
"loss": 1.5936,
"mean_token_accuracy": 0.6028061434626579,
"num_tokens": 1575331.0,
"step": 121
},
{
"epoch": 0.07808,
"grad_norm": 4.482457637786865,
"learning_rate": 5e-06,
"loss": 1.3789,
"mean_token_accuracy": 0.6534934043884277,
"num_tokens": 1587643.0,
"step": 122
},
{
"epoch": 0.07872,
"grad_norm": 3.56472110748291,
"learning_rate": 5e-06,
"loss": 1.4365,
"mean_token_accuracy": 0.636934220790863,
"num_tokens": 1602307.0,
"step": 123
},
{
"epoch": 0.07936,
"grad_norm": 3.643859386444092,
"learning_rate": 5e-06,
"loss": 1.3166,
"mean_token_accuracy": 0.6632036790251732,
"num_tokens": 1616771.0,
"step": 124
},
{
"epoch": 0.08,
"grad_norm": 3.907698154449463,
"learning_rate": 5e-06,
"loss": 1.347,
"mean_token_accuracy": 0.6501271575689316,
"num_tokens": 1628788.0,
"step": 125
},
{
"epoch": 0.08064,
"grad_norm": 3.952827215194702,
"learning_rate": 5e-06,
"loss": 1.5638,
"mean_token_accuracy": 0.6289772987365723,
"num_tokens": 1643292.0,
"step": 126
},
{
"epoch": 0.08128,
"grad_norm": 3.829796314239502,
"learning_rate": 5e-06,
"loss": 1.4867,
"mean_token_accuracy": 0.6409079134464264,
"num_tokens": 1657859.0,
"step": 127
},
{
"epoch": 0.08192,
"grad_norm": 3.4832980632781982,
"learning_rate": 5e-06,
"loss": 1.1756,
"mean_token_accuracy": 0.6971960365772247,
"num_tokens": 1672891.0,
"step": 128
},
{
"epoch": 0.08256,
"grad_norm": 4.326021671295166,
"learning_rate": 5e-06,
"loss": 1.3608,
"mean_token_accuracy": 0.663967490196228,
"num_tokens": 1685243.0,
"step": 129
},
{
"epoch": 0.0832,
"grad_norm": 3.8590521812438965,
"learning_rate": 5e-06,
"loss": 1.3535,
"mean_token_accuracy": 0.6475187167525291,
"num_tokens": 1699220.0,
"step": 130
},
{
"epoch": 0.08384,
"grad_norm": 4.005199432373047,
"learning_rate": 5e-06,
"loss": 1.5247,
"mean_token_accuracy": 0.6049772128462791,
"num_tokens": 1711827.0,
"step": 131
},
{
"epoch": 0.08448,
"grad_norm": 5.4232378005981445,
"learning_rate": 5e-06,
"loss": 1.4393,
"mean_token_accuracy": 0.6452240273356438,
"num_tokens": 1722307.0,
"step": 132
},
{
"epoch": 0.08512,
"grad_norm": 3.7561964988708496,
"learning_rate": 5e-06,
"loss": 1.3973,
"mean_token_accuracy": 0.643314465880394,
"num_tokens": 1735760.0,
"step": 133
},
{
"epoch": 0.08576,
"grad_norm": 4.557453155517578,
"learning_rate": 5e-06,
"loss": 1.4625,
"mean_token_accuracy": 0.6588743627071381,
"num_tokens": 1749840.0,
"step": 134
},
{
"epoch": 0.0864,
"grad_norm": 4.375631809234619,
"learning_rate": 5e-06,
"loss": 1.3369,
"mean_token_accuracy": 0.6503657773137093,
"num_tokens": 1764508.0,
"step": 135
},
{
"epoch": 0.08704,
"grad_norm": 3.6710991859436035,
"learning_rate": 5e-06,
"loss": 1.3455,
"mean_token_accuracy": 0.6592239439487457,
"num_tokens": 1777991.0,
"step": 136
},
{
"epoch": 0.08768,
"grad_norm": 4.055100440979004,
"learning_rate": 5e-06,
"loss": 1.2004,
"mean_token_accuracy": 0.6970224753022194,
"num_tokens": 1790908.0,
"step": 137
},
{
"epoch": 0.08832,
"grad_norm": 3.4759104251861572,
"learning_rate": 5e-06,
"loss": 1.3996,
"mean_token_accuracy": 0.6468167528510094,
"num_tokens": 1807180.0,
"step": 138
},
{
"epoch": 0.08896,
"grad_norm": 4.201884746551514,
"learning_rate": 5e-06,
"loss": 1.36,
"mean_token_accuracy": 0.645078033208847,
"num_tokens": 1819408.0,
"step": 139
},
{
"epoch": 0.0896,
"grad_norm": 4.253586769104004,
"learning_rate": 5e-06,
"loss": 1.555,
"mean_token_accuracy": 0.6383631229400635,
"num_tokens": 1832773.0,
"step": 140
},
{
"epoch": 0.09024,
"grad_norm": 3.354541063308716,
"learning_rate": 5e-06,
"loss": 1.2203,
"mean_token_accuracy": 0.6814669519662857,
"num_tokens": 1848152.0,
"step": 141
},
{
"epoch": 0.09088,
"grad_norm": 3.436411142349243,
"learning_rate": 5e-06,
"loss": 1.4883,
"mean_token_accuracy": 0.6492328196763992,
"num_tokens": 1863701.0,
"step": 142
},
{
"epoch": 0.09152,
"grad_norm": 4.413644790649414,
"learning_rate": 5e-06,
"loss": 1.0295,
"mean_token_accuracy": 0.7075678631663322,
"num_tokens": 1872939.0,
"step": 143
},
{
"epoch": 0.09216,
"grad_norm": 5.079326152801514,
"learning_rate": 5e-06,
"loss": 1.3153,
"mean_token_accuracy": 0.6815094351768494,
"num_tokens": 1881487.0,
"step": 144
},
{
"epoch": 0.0928,
"grad_norm": 4.065243721008301,
"learning_rate": 5e-06,
"loss": 1.3596,
"mean_token_accuracy": 0.6275613754987717,
"num_tokens": 1892857.0,
"step": 145
},
{
"epoch": 0.09344,
"grad_norm": 3.9777028560638428,
"learning_rate": 5e-06,
"loss": 1.1708,
"mean_token_accuracy": 0.7008514180779457,
"num_tokens": 1905468.0,
"step": 146
},
{
"epoch": 0.09408,
"grad_norm": 3.9590489864349365,
"learning_rate": 5e-06,
"loss": 1.3645,
"mean_token_accuracy": 0.645123079419136,
"num_tokens": 1921339.0,
"step": 147
},
{
"epoch": 0.09472,
"grad_norm": 4.232624053955078,
"learning_rate": 5e-06,
"loss": 1.4376,
"mean_token_accuracy": 0.6167034581303596,
"num_tokens": 1933922.0,
"step": 148
},
{
"epoch": 0.09536,
"grad_norm": 4.538359642028809,
"learning_rate": 5e-06,
"loss": 1.2695,
"mean_token_accuracy": 0.6443182751536369,
"num_tokens": 1946327.0,
"step": 149
},
{
"epoch": 0.096,
"grad_norm": 3.987658977508545,
"learning_rate": 5e-06,
"loss": 1.165,
"mean_token_accuracy": 0.700744241476059,
"num_tokens": 1958145.0,
"step": 150
},
{
"epoch": 0.09664,
"grad_norm": 5.451640605926514,
"learning_rate": 5e-06,
"loss": 1.1847,
"mean_token_accuracy": 0.6844369322061539,
"num_tokens": 1968466.0,
"step": 151
},
{
"epoch": 0.09728,
"grad_norm": 3.7554731369018555,
"learning_rate": 5e-06,
"loss": 1.3363,
"mean_token_accuracy": 0.6585431769490242,
"num_tokens": 1981386.0,
"step": 152
},
{
"epoch": 0.09792,
"grad_norm": 3.601236581802368,
"learning_rate": 5e-06,
"loss": 1.3988,
"mean_token_accuracy": 0.6534986943006516,
"num_tokens": 1995319.0,
"step": 153
},
{
"epoch": 0.09856,
"grad_norm": 3.569467306137085,
"learning_rate": 5e-06,
"loss": 1.1972,
"mean_token_accuracy": 0.6820317879319191,
"num_tokens": 2008019.0,
"step": 154
},
{
"epoch": 0.0992,
"grad_norm": 3.896125078201294,
"learning_rate": 5e-06,
"loss": 1.3651,
"mean_token_accuracy": 0.6400049701333046,
"num_tokens": 2021300.0,
"step": 155
},
{
"epoch": 0.09984,
"grad_norm": 3.486210584640503,
"learning_rate": 5e-06,
"loss": 1.3398,
"mean_token_accuracy": 0.6543328985571861,
"num_tokens": 2033964.0,
"step": 156
},
{
"epoch": 0.10048,
"grad_norm": 3.03397274017334,
"learning_rate": 5e-06,
"loss": 1.4379,
"mean_token_accuracy": 0.6368994787335396,
"num_tokens": 2051392.0,
"step": 157
},
{
"epoch": 0.10112,
"grad_norm": 3.8133559226989746,
"learning_rate": 5e-06,
"loss": 1.4191,
"mean_token_accuracy": 0.6660285517573357,
"num_tokens": 2063899.0,
"step": 158
},
{
"epoch": 0.10176,
"grad_norm": 2.894871234893799,
"learning_rate": 5e-06,
"loss": 1.149,
"mean_token_accuracy": 0.687338799238205,
"num_tokens": 2081505.0,
"step": 159
},
{
"epoch": 0.1024,
"grad_norm": 4.369359016418457,
"learning_rate": 5e-06,
"loss": 1.3478,
"mean_token_accuracy": 0.6496308445930481,
"num_tokens": 2092604.0,
"step": 160
},
{
"epoch": 0.10304,
"grad_norm": 4.516582489013672,
"learning_rate": 5e-06,
"loss": 1.3535,
"mean_token_accuracy": 0.6436882838606834,
"num_tokens": 2103256.0,
"step": 161
},
{
"epoch": 0.10368,
"grad_norm": 3.317488431930542,
"learning_rate": 5e-06,
"loss": 1.3131,
"mean_token_accuracy": 0.6673252284526825,
"num_tokens": 2119060.0,
"step": 162
},
{
"epoch": 0.10432,
"grad_norm": 4.195248603820801,
"learning_rate": 5e-06,
"loss": 1.5371,
"mean_token_accuracy": 0.6269242167472839,
"num_tokens": 2131564.0,
"step": 163
},
{
"epoch": 0.10496,
"grad_norm": 4.055263042449951,
"learning_rate": 5e-06,
"loss": 1.2917,
"mean_token_accuracy": 0.672488197684288,
"num_tokens": 2144473.0,
"step": 164
},
{
"epoch": 0.1056,
"grad_norm": 3.9197511672973633,
"learning_rate": 5e-06,
"loss": 1.3064,
"mean_token_accuracy": 0.664051964879036,
"num_tokens": 2157277.0,
"step": 165
},
{
"epoch": 0.10624,
"grad_norm": 4.073387145996094,
"learning_rate": 5e-06,
"loss": 1.3085,
"mean_token_accuracy": 0.6389222107827663,
"num_tokens": 2168765.0,
"step": 166
},
{
"epoch": 0.10688,
"grad_norm": 3.508542060852051,
"learning_rate": 5e-06,
"loss": 1.2401,
"mean_token_accuracy": 0.6622222438454628,
"num_tokens": 2182480.0,
"step": 167
},
{
"epoch": 0.10752,
"grad_norm": 5.038687229156494,
"learning_rate": 5e-06,
"loss": 1.3458,
"mean_token_accuracy": 0.6548017933964729,
"num_tokens": 2192216.0,
"step": 168
},
{
"epoch": 0.10816,
"grad_norm": 3.743532180786133,
"learning_rate": 5e-06,
"loss": 1.3079,
"mean_token_accuracy": 0.675239585340023,
"num_tokens": 2205231.0,
"step": 169
},
{
"epoch": 0.1088,
"grad_norm": 3.9550719261169434,
"learning_rate": 5e-06,
"loss": 1.4297,
"mean_token_accuracy": 0.6310381144285202,
"num_tokens": 2219210.0,
"step": 170
},
{
"epoch": 0.10944,
"grad_norm": 3.988621950149536,
"learning_rate": 5e-06,
"loss": 1.2593,
"mean_token_accuracy": 0.6804088428616524,
"num_tokens": 2232718.0,
"step": 171
},
{
"epoch": 0.11008,
"grad_norm": 4.214746475219727,
"learning_rate": 5e-06,
"loss": 1.1987,
"mean_token_accuracy": 0.6554268151521683,
"num_tokens": 2244509.0,
"step": 172
},
{
"epoch": 0.11072,
"grad_norm": 4.047118186950684,
"learning_rate": 5e-06,
"loss": 1.3908,
"mean_token_accuracy": 0.6510942876338959,
"num_tokens": 2256799.0,
"step": 173
},
{
"epoch": 0.11136,
"grad_norm": 4.169956207275391,
"learning_rate": 5e-06,
"loss": 1.1637,
"mean_token_accuracy": 0.6976971700787544,
"num_tokens": 2267854.0,
"step": 174
},
{
"epoch": 0.112,
"grad_norm": 4.0025434494018555,
"learning_rate": 5e-06,
"loss": 1.2681,
"mean_token_accuracy": 0.6660499349236488,
"num_tokens": 2281207.0,
"step": 175
},
{
"epoch": 0.11264,
"grad_norm": 3.6148102283477783,
"learning_rate": 5e-06,
"loss": 1.0893,
"mean_token_accuracy": 0.6946917325258255,
"num_tokens": 2294364.0,
"step": 176
},
{
"epoch": 0.11328,
"grad_norm": 4.246650695800781,
"learning_rate": 5e-06,
"loss": 1.3055,
"mean_token_accuracy": 0.6432985439896584,
"num_tokens": 2304580.0,
"step": 177
},
{
"epoch": 0.11392,
"grad_norm": 3.6579151153564453,
"learning_rate": 5e-06,
"loss": 1.3814,
"mean_token_accuracy": 0.6313494071364403,
"num_tokens": 2319903.0,
"step": 178
},
{
"epoch": 0.11456,
"grad_norm": 3.988365411758423,
"learning_rate": 5e-06,
"loss": 1.1713,
"mean_token_accuracy": 0.6748137697577477,
"num_tokens": 2334193.0,
"step": 179
},
{
"epoch": 0.1152,
"grad_norm": 4.839256286621094,
"learning_rate": 5e-06,
"loss": 1.6099,
"mean_token_accuracy": 0.642399325966835,
"num_tokens": 2344137.0,
"step": 180
},
{
"epoch": 0.11584,
"grad_norm": 3.8175253868103027,
"learning_rate": 5e-06,
"loss": 1.3848,
"mean_token_accuracy": 0.6434383615851402,
"num_tokens": 2356787.0,
"step": 181
},
{
"epoch": 0.11648,
"grad_norm": 4.244999885559082,
"learning_rate": 5e-06,
"loss": 1.3926,
"mean_token_accuracy": 0.6472559943795204,
"num_tokens": 2369124.0,
"step": 182
},
{
"epoch": 0.11712,
"grad_norm": 3.850306749343872,
"learning_rate": 5e-06,
"loss": 1.266,
"mean_token_accuracy": 0.6437094509601593,
"num_tokens": 2383314.0,
"step": 183
},
{
"epoch": 0.11776,
"grad_norm": 5.292626857757568,
"learning_rate": 5e-06,
"loss": 1.7328,
"mean_token_accuracy": 0.5816368944942951,
"num_tokens": 2392822.0,
"step": 184
},
{
"epoch": 0.1184,
"grad_norm": 4.827669620513916,
"learning_rate": 5e-06,
"loss": 1.6344,
"mean_token_accuracy": 0.6154987290501595,
"num_tokens": 2404362.0,
"step": 185
},
{
"epoch": 0.11904,
"grad_norm": 4.1474995613098145,
"learning_rate": 5e-06,
"loss": 1.3776,
"mean_token_accuracy": 0.6394111067056656,
"num_tokens": 2415206.0,
"step": 186
},
{
"epoch": 0.11968,
"grad_norm": 4.1867995262146,
"learning_rate": 5e-06,
"loss": 1.1555,
"mean_token_accuracy": 0.6839761063456535,
"num_tokens": 2428366.0,
"step": 187
},
{
"epoch": 0.12032,
"grad_norm": 3.8448567390441895,
"learning_rate": 5e-06,
"loss": 1.3755,
"mean_token_accuracy": 0.6562356427311897,
"num_tokens": 2440401.0,
"step": 188
},
{
"epoch": 0.12096,
"grad_norm": 3.82326078414917,
"learning_rate": 5e-06,
"loss": 1.2447,
"mean_token_accuracy": 0.651421345770359,
"num_tokens": 2453194.0,
"step": 189
},
{
"epoch": 0.1216,
"grad_norm": 3.8324315547943115,
"learning_rate": 5e-06,
"loss": 1.4677,
"mean_token_accuracy": 0.6340256333351135,
"num_tokens": 2466328.0,
"step": 190
},
{
"epoch": 0.12224,
"grad_norm": 3.4532899856567383,
"learning_rate": 5e-06,
"loss": 1.3832,
"mean_token_accuracy": 0.6311789453029633,
"num_tokens": 2480165.0,
"step": 191
},
{
"epoch": 0.12288,
"grad_norm": 6.352081298828125,
"learning_rate": 5e-06,
"loss": 1.5605,
"mean_token_accuracy": 0.6426242366433144,
"num_tokens": 2489563.0,
"step": 192
},
{
"epoch": 0.12352,
"grad_norm": 3.9290707111358643,
"learning_rate": 5e-06,
"loss": 1.4923,
"mean_token_accuracy": 0.621891662478447,
"num_tokens": 2503752.0,
"step": 193
},
{
"epoch": 0.12416,
"grad_norm": 3.5599541664123535,
"learning_rate": 5e-06,
"loss": 1.658,
"mean_token_accuracy": 0.5923861265182495,
"num_tokens": 2517941.0,
"step": 194
},
{
"epoch": 0.1248,
"grad_norm": 4.907262802124023,
"learning_rate": 5e-06,
"loss": 1.3243,
"mean_token_accuracy": 0.684236004948616,
"num_tokens": 2531694.0,
"step": 195
},
{
"epoch": 0.12544,
"grad_norm": 3.895585298538208,
"learning_rate": 5e-06,
"loss": 1.2594,
"mean_token_accuracy": 0.6675282418727875,
"num_tokens": 2543318.0,
"step": 196
},
{
"epoch": 0.12608,
"grad_norm": 3.7483768463134766,
"learning_rate": 5e-06,
"loss": 1.3555,
"mean_token_accuracy": 0.6702639237046242,
"num_tokens": 2555858.0,
"step": 197
},
{
"epoch": 0.12672,
"grad_norm": 3.980715751647949,
"learning_rate": 5e-06,
"loss": 1.3247,
"mean_token_accuracy": 0.654958538711071,
"num_tokens": 2570848.0,
"step": 198
},
{
"epoch": 0.12736,
"grad_norm": 3.402679443359375,
"learning_rate": 5e-06,
"loss": 1.4151,
"mean_token_accuracy": 0.639847457408905,
"num_tokens": 2586951.0,
"step": 199
},
{
"epoch": 0.128,
"grad_norm": 3.603440284729004,
"learning_rate": 5e-06,
"loss": 1.138,
"mean_token_accuracy": 0.6972187757492065,
"num_tokens": 2600380.0,
"step": 200
},
{
"epoch": 0.12864,
"grad_norm": 4.226911544799805,
"learning_rate": 5e-06,
"loss": 1.2305,
"mean_token_accuracy": 0.6777093335986137,
"num_tokens": 2612169.0,
"step": 201
},
{
"epoch": 0.12928,
"grad_norm": 4.133816719055176,
"learning_rate": 5e-06,
"loss": 1.5127,
"mean_token_accuracy": 0.645031102001667,
"num_tokens": 2625681.0,
"step": 202
},
{
"epoch": 0.12992,
"grad_norm": 4.464379787445068,
"learning_rate": 5e-06,
"loss": 1.3419,
"mean_token_accuracy": 0.655508816242218,
"num_tokens": 2638449.0,
"step": 203
},
{
"epoch": 0.13056,
"grad_norm": 3.691314697265625,
"learning_rate": 5e-06,
"loss": 1.4329,
"mean_token_accuracy": 0.6271785870194435,
"num_tokens": 2651404.0,
"step": 204
},
{
"epoch": 0.1312,
"grad_norm": 3.735065460205078,
"learning_rate": 5e-06,
"loss": 1.259,
"mean_token_accuracy": 0.6792504116892815,
"num_tokens": 2663577.0,
"step": 205
},
{
"epoch": 0.13184,
"grad_norm": 3.8141613006591797,
"learning_rate": 5e-06,
"loss": 1.3812,
"mean_token_accuracy": 0.6525682806968689,
"num_tokens": 2675704.0,
"step": 206
},
{
"epoch": 0.13248,
"grad_norm": 4.096824645996094,
"learning_rate": 5e-06,
"loss": 1.4175,
"mean_token_accuracy": 0.6340369358658791,
"num_tokens": 2687284.0,
"step": 207
},
{
"epoch": 0.13312,
"grad_norm": 4.180744171142578,
"learning_rate": 5e-06,
"loss": 1.37,
"mean_token_accuracy": 0.6400524824857712,
"num_tokens": 2700343.0,
"step": 208
},
{
"epoch": 0.13376,
"grad_norm": 4.275300979614258,
"learning_rate": 5e-06,
"loss": 1.3739,
"mean_token_accuracy": 0.6336183995008469,
"num_tokens": 2713760.0,
"step": 209
},
{
"epoch": 0.1344,
"grad_norm": 3.547708511352539,
"learning_rate": 5e-06,
"loss": 1.4059,
"mean_token_accuracy": 0.6354609504342079,
"num_tokens": 2728045.0,
"step": 210
},
{
"epoch": 0.13504,
"grad_norm": 4.222541809082031,
"learning_rate": 5e-06,
"loss": 1.3929,
"mean_token_accuracy": 0.6449902206659317,
"num_tokens": 2741308.0,
"step": 211
},
{
"epoch": 0.13568,
"grad_norm": 3.930753707885742,
"learning_rate": 5e-06,
"loss": 1.5346,
"mean_token_accuracy": 0.6030523180961609,
"num_tokens": 2754620.0,
"step": 212
},
{
"epoch": 0.13632,
"grad_norm": 3.6813647747039795,
"learning_rate": 5e-06,
"loss": 1.3267,
"mean_token_accuracy": 0.6522213146090508,
"num_tokens": 2768180.0,
"step": 213
},
{
"epoch": 0.13696,
"grad_norm": 4.064117431640625,
"learning_rate": 5e-06,
"loss": 1.3081,
"mean_token_accuracy": 0.6697950512170792,
"num_tokens": 2781050.0,
"step": 214
},
{
"epoch": 0.1376,
"grad_norm": 3.927386522293091,
"learning_rate": 5e-06,
"loss": 1.5213,
"mean_token_accuracy": 0.6275500729680061,
"num_tokens": 2792644.0,
"step": 215
},
{
"epoch": 0.13824,
"grad_norm": 3.762558937072754,
"learning_rate": 5e-06,
"loss": 1.339,
"mean_token_accuracy": 0.6525787115097046,
"num_tokens": 2805857.0,
"step": 216
},
{
"epoch": 0.13888,
"grad_norm": 3.3911473751068115,
"learning_rate": 5e-06,
"loss": 1.3861,
"mean_token_accuracy": 0.646359771490097,
"num_tokens": 2822403.0,
"step": 217
},
{
"epoch": 0.13952,
"grad_norm": 3.3811612129211426,
"learning_rate": 5e-06,
"loss": 1.4414,
"mean_token_accuracy": 0.6381309777498245,
"num_tokens": 2836579.0,
"step": 218
},
{
"epoch": 0.14016,
"grad_norm": 3.9682304859161377,
"learning_rate": 5e-06,
"loss": 1.4728,
"mean_token_accuracy": 0.611208513379097,
"num_tokens": 2848833.0,
"step": 219
},
{
"epoch": 0.1408,
"grad_norm": 4.066648483276367,
"learning_rate": 5e-06,
"loss": 1.3081,
"mean_token_accuracy": 0.6451118811964989,
"num_tokens": 2859672.0,
"step": 220
},
{
"epoch": 0.14144,
"grad_norm": 3.577544927597046,
"learning_rate": 5e-06,
"loss": 1.4143,
"mean_token_accuracy": 0.6415835171937943,
"num_tokens": 2875418.0,
"step": 221
},
{
"epoch": 0.14208,
"grad_norm": 3.8373844623565674,
"learning_rate": 5e-06,
"loss": 1.1867,
"mean_token_accuracy": 0.6628148853778839,
"num_tokens": 2886843.0,
"step": 222
},
{
"epoch": 0.14272,
"grad_norm": 3.243741273880005,
"learning_rate": 5e-06,
"loss": 1.3639,
"mean_token_accuracy": 0.6402468308806419,
"num_tokens": 2903000.0,
"step": 223
},
{
"epoch": 0.14336,
"grad_norm": 3.6917643547058105,
"learning_rate": 5e-06,
"loss": 1.4826,
"mean_token_accuracy": 0.6137516796588898,
"num_tokens": 2916086.0,
"step": 224
},
{
"epoch": 0.144,
"grad_norm": 3.6961069107055664,
"learning_rate": 5e-06,
"loss": 1.3914,
"mean_token_accuracy": 0.6221867948770523,
"num_tokens": 2928082.0,
"step": 225
},
{
"epoch": 0.14464,
"grad_norm": 3.3489155769348145,
"learning_rate": 5e-06,
"loss": 1.2829,
"mean_token_accuracy": 0.6520458236336708,
"num_tokens": 2941972.0,
"step": 226
},
{
"epoch": 0.14528,
"grad_norm": 3.9291248321533203,
"learning_rate": 5e-06,
"loss": 1.4375,
"mean_token_accuracy": 0.6534432545304298,
"num_tokens": 2954326.0,
"step": 227
},
{
"epoch": 0.14592,
"grad_norm": 4.408154487609863,
"learning_rate": 5e-06,
"loss": 1.37,
"mean_token_accuracy": 0.6362905651330948,
"num_tokens": 2964773.0,
"step": 228
},
{
"epoch": 0.14656,
"grad_norm": 3.3480911254882812,
"learning_rate": 5e-06,
"loss": 1.2627,
"mean_token_accuracy": 0.664552852511406,
"num_tokens": 2979164.0,
"step": 229
},
{
"epoch": 0.1472,
"grad_norm": 3.5520999431610107,
"learning_rate": 5e-06,
"loss": 1.2172,
"mean_token_accuracy": 0.677513062953949,
"num_tokens": 2993457.0,
"step": 230
},
{
"epoch": 0.14784,
"grad_norm": 3.3027398586273193,
"learning_rate": 5e-06,
"loss": 1.4642,
"mean_token_accuracy": 0.6177601739764214,
"num_tokens": 3007665.0,
"step": 231
},
{
"epoch": 0.14848,
"grad_norm": 3.64074444770813,
"learning_rate": 5e-06,
"loss": 1.2464,
"mean_token_accuracy": 0.662057913839817,
"num_tokens": 3020550.0,
"step": 232
},
{
"epoch": 0.14912,
"grad_norm": 3.9199254512786865,
"learning_rate": 5e-06,
"loss": 1.3942,
"mean_token_accuracy": 0.6174388378858566,
"num_tokens": 3034384.0,
"step": 233
},
{
"epoch": 0.14976,
"grad_norm": 4.028416633605957,
"learning_rate": 5e-06,
"loss": 1.2443,
"mean_token_accuracy": 0.6706736162304878,
"num_tokens": 3046491.0,
"step": 234
},
{
"epoch": 0.1504,
"grad_norm": 3.4330265522003174,
"learning_rate": 5e-06,
"loss": 1.4317,
"mean_token_accuracy": 0.6397150233387947,
"num_tokens": 3061547.0,
"step": 235
},
{
"epoch": 0.15104,
"grad_norm": 4.62261438369751,
"learning_rate": 5e-06,
"loss": 1.4654,
"mean_token_accuracy": 0.6216901019215584,
"num_tokens": 3073316.0,
"step": 236
},
{
"epoch": 0.15168,
"grad_norm": 3.8148386478424072,
"learning_rate": 5e-06,
"loss": 1.3572,
"mean_token_accuracy": 0.6381306573748589,
"num_tokens": 3086074.0,
"step": 237
},
{
"epoch": 0.15232,
"grad_norm": 3.6774654388427734,
"learning_rate": 5e-06,
"loss": 1.2743,
"mean_token_accuracy": 0.6746890023350716,
"num_tokens": 3099218.0,
"step": 238
},
{
"epoch": 0.15296,
"grad_norm": 3.8915648460388184,
"learning_rate": 5e-06,
"loss": 1.3005,
"mean_token_accuracy": 0.6652230620384216,
"num_tokens": 3113283.0,
"step": 239
},
{
"epoch": 0.1536,
"grad_norm": 3.641663074493408,
"learning_rate": 5e-06,
"loss": 1.4299,
"mean_token_accuracy": 0.6305139660835266,
"num_tokens": 3127092.0,
"step": 240
},
{
"epoch": 0.15424,
"grad_norm": 3.9802157878875732,
"learning_rate": 5e-06,
"loss": 1.3628,
"mean_token_accuracy": 0.6499741598963737,
"num_tokens": 3137977.0,
"step": 241
},
{
"epoch": 0.15488,
"grad_norm": 3.3519856929779053,
"learning_rate": 5e-06,
"loss": 1.3649,
"mean_token_accuracy": 0.6538999378681183,
"num_tokens": 3153296.0,
"step": 242
},
{
"epoch": 0.15552,
"grad_norm": 3.9312145709991455,
"learning_rate": 5e-06,
"loss": 1.1886,
"mean_token_accuracy": 0.687839575111866,
"num_tokens": 3165430.0,
"step": 243
},
{
"epoch": 0.15616,
"grad_norm": 3.9684488773345947,
"learning_rate": 5e-06,
"loss": 1.392,
"mean_token_accuracy": 0.629355788230896,
"num_tokens": 3176799.0,
"step": 244
},
{
"epoch": 0.1568,
"grad_norm": 3.610091209411621,
"learning_rate": 5e-06,
"loss": 1.3166,
"mean_token_accuracy": 0.6479083597660065,
"num_tokens": 3190258.0,
"step": 245
},
{
"epoch": 0.15744,
"grad_norm": 3.921807289123535,
"learning_rate": 5e-06,
"loss": 1.1064,
"mean_token_accuracy": 0.7143979370594025,
"num_tokens": 3201789.0,
"step": 246
},
{
"epoch": 0.15808,
"grad_norm": 3.4888627529144287,
"learning_rate": 5e-06,
"loss": 1.2273,
"mean_token_accuracy": 0.6818583980202675,
"num_tokens": 3214773.0,
"step": 247
},
{
"epoch": 0.15872,
"grad_norm": 3.9141690731048584,
"learning_rate": 5e-06,
"loss": 1.2463,
"mean_token_accuracy": 0.6758697032928467,
"num_tokens": 3226302.0,
"step": 248
},
{
"epoch": 0.15936,
"grad_norm": 3.585526943206787,
"learning_rate": 5e-06,
"loss": 1.29,
"mean_token_accuracy": 0.6522084772586823,
"num_tokens": 3239487.0,
"step": 249
},
{
"epoch": 0.16,
"grad_norm": 2.9985756874084473,
"learning_rate": 5e-06,
"loss": 1.2875,
"mean_token_accuracy": 0.665367841720581,
"num_tokens": 3254553.0,
"step": 250
},
{
"epoch": 0.16064,
"grad_norm": 4.460598945617676,
"learning_rate": 5e-06,
"loss": 1.413,
"mean_token_accuracy": 0.6606506556272507,
"num_tokens": 3266374.0,
"step": 251
},
{
"epoch": 0.16128,
"grad_norm": 3.867008686065674,
"learning_rate": 5e-06,
"loss": 1.4733,
"mean_token_accuracy": 0.6357235088944435,
"num_tokens": 3278642.0,
"step": 252
},
{
"epoch": 0.16192,
"grad_norm": 3.6840028762817383,
"learning_rate": 5e-06,
"loss": 1.3735,
"mean_token_accuracy": 0.6500705629587173,
"num_tokens": 3292643.0,
"step": 253
},
{
"epoch": 0.16256,
"grad_norm": 3.631727933883667,
"learning_rate": 5e-06,
"loss": 1.3561,
"mean_token_accuracy": 0.6603741720318794,
"num_tokens": 3308572.0,
"step": 254
},
{
"epoch": 0.1632,
"grad_norm": 3.8139543533325195,
"learning_rate": 5e-06,
"loss": 1.4079,
"mean_token_accuracy": 0.6566642299294472,
"num_tokens": 3321852.0,
"step": 255
},
{
"epoch": 0.16384,
"grad_norm": 4.278744697570801,
"learning_rate": 5e-06,
"loss": 1.3128,
"mean_token_accuracy": 0.6340290307998657,
"num_tokens": 3333364.0,
"step": 256
},
{
"epoch": 0.16448,
"grad_norm": 3.855288505554199,
"learning_rate": 5e-06,
"loss": 1.2726,
"mean_token_accuracy": 0.6573414877057076,
"num_tokens": 3346153.0,
"step": 257
},
{
"epoch": 0.16512,
"grad_norm": 3.894836187362671,
"learning_rate": 5e-06,
"loss": 1.5052,
"mean_token_accuracy": 0.6395176202058792,
"num_tokens": 3357803.0,
"step": 258
},
{
"epoch": 0.16576,
"grad_norm": 3.7376608848571777,
"learning_rate": 5e-06,
"loss": 1.3856,
"mean_token_accuracy": 0.6377875059843063,
"num_tokens": 3370640.0,
"step": 259
},
{
"epoch": 0.1664,
"grad_norm": 3.66434907913208,
"learning_rate": 5e-06,
"loss": 1.2933,
"mean_token_accuracy": 0.6526513993740082,
"num_tokens": 3384626.0,
"step": 260
},
{
"epoch": 0.16704,
"grad_norm": 4.31889533996582,
"learning_rate": 5e-06,
"loss": 1.4037,
"mean_token_accuracy": 0.6519733518362045,
"num_tokens": 3396351.0,
"step": 261
},
{
"epoch": 0.16768,
"grad_norm": 4.194382667541504,
"learning_rate": 5e-06,
"loss": 1.2248,
"mean_token_accuracy": 0.6719919368624687,
"num_tokens": 3410809.0,
"step": 262
},
{
"epoch": 0.16832,
"grad_norm": 5.298657417297363,
"learning_rate": 5e-06,
"loss": 1.2344,
"mean_token_accuracy": 0.6541409119963646,
"num_tokens": 3421666.0,
"step": 263
},
{
"epoch": 0.16896,
"grad_norm": 3.7578792572021484,
"learning_rate": 5e-06,
"loss": 1.4221,
"mean_token_accuracy": 0.6374265551567078,
"num_tokens": 3435240.0,
"step": 264
},
{
"epoch": 0.1696,
"grad_norm": 4.36591100692749,
"learning_rate": 5e-06,
"loss": 1.3996,
"mean_token_accuracy": 0.6582349985837936,
"num_tokens": 3447417.0,
"step": 265
},
{
"epoch": 0.17024,
"grad_norm": 4.242166042327881,
"learning_rate": 5e-06,
"loss": 1.2213,
"mean_token_accuracy": 0.6886605694890022,
"num_tokens": 3457202.0,
"step": 266
},
{
"epoch": 0.17088,
"grad_norm": 4.421549320220947,
"learning_rate": 5e-06,
"loss": 1.4154,
"mean_token_accuracy": 0.6361653730273247,
"num_tokens": 3470888.0,
"step": 267
},
{
"epoch": 0.17152,
"grad_norm": 3.4272501468658447,
"learning_rate": 5e-06,
"loss": 1.4722,
"mean_token_accuracy": 0.617170162498951,
"num_tokens": 3483711.0,
"step": 268
},
{
"epoch": 0.17216,
"grad_norm": 4.099259853363037,
"learning_rate": 5e-06,
"loss": 1.3181,
"mean_token_accuracy": 0.6635381802916527,
"num_tokens": 3494261.0,
"step": 269
},
{
"epoch": 0.1728,
"grad_norm": 3.460908889770508,
"learning_rate": 5e-06,
"loss": 1.2027,
"mean_token_accuracy": 0.6816031113266945,
"num_tokens": 3508416.0,
"step": 270
},
{
"epoch": 0.17344,
"grad_norm": 4.011609077453613,
"learning_rate": 5e-06,
"loss": 1.2527,
"mean_token_accuracy": 0.6691607385873795,
"num_tokens": 3521566.0,
"step": 271
},
{
"epoch": 0.17408,
"grad_norm": 4.310615062713623,
"learning_rate": 5e-06,
"loss": 1.5243,
"mean_token_accuracy": 0.606864832341671,
"num_tokens": 3532437.0,
"step": 272
},
{
"epoch": 0.17472,
"grad_norm": 3.865201950073242,
"learning_rate": 5e-06,
"loss": 1.3655,
"mean_token_accuracy": 0.6517080217599869,
"num_tokens": 3544654.0,
"step": 273
},
{
"epoch": 0.17536,
"grad_norm": 3.779001235961914,
"learning_rate": 5e-06,
"loss": 1.5361,
"mean_token_accuracy": 0.6139826104044914,
"num_tokens": 3560143.0,
"step": 274
},
{
"epoch": 0.176,
"grad_norm": 3.909745454788208,
"learning_rate": 5e-06,
"loss": 1.2,
"mean_token_accuracy": 0.6911701187491417,
"num_tokens": 3571846.0,
"step": 275
},
{
"epoch": 0.17664,
"grad_norm": 4.487984657287598,
"learning_rate": 5e-06,
"loss": 1.307,
"mean_token_accuracy": 0.6519964337348938,
"num_tokens": 3583280.0,
"step": 276
},
{
"epoch": 0.17728,
"grad_norm": 4.58504056930542,
"learning_rate": 5e-06,
"loss": 1.4673,
"mean_token_accuracy": 0.6244921982288361,
"num_tokens": 3593797.0,
"step": 277
},
{
"epoch": 0.17792,
"grad_norm": 3.6989223957061768,
"learning_rate": 5e-06,
"loss": 1.4841,
"mean_token_accuracy": 0.6436078920960426,
"num_tokens": 3606077.0,
"step": 278
},
{
"epoch": 0.17856,
"grad_norm": 3.5363776683807373,
"learning_rate": 5e-06,
"loss": 1.3562,
"mean_token_accuracy": 0.6404093876481056,
"num_tokens": 3619274.0,
"step": 279
},
{
"epoch": 0.1792,
"grad_norm": 3.5803604125976562,
"learning_rate": 5e-06,
"loss": 1.2417,
"mean_token_accuracy": 0.6808914020657539,
"num_tokens": 3631353.0,
"step": 280
},
{
"epoch": 0.17984,
"grad_norm": 3.8783459663391113,
"learning_rate": 5e-06,
"loss": 1.3802,
"mean_token_accuracy": 0.6372303292155266,
"num_tokens": 3645098.0,
"step": 281
},
{
"epoch": 0.18048,
"grad_norm": 4.057406425476074,
"learning_rate": 5e-06,
"loss": 1.2089,
"mean_token_accuracy": 0.6749606877565384,
"num_tokens": 3657936.0,
"step": 282
},
{
"epoch": 0.18112,
"grad_norm": 3.0335772037506104,
"learning_rate": 5e-06,
"loss": 1.1465,
"mean_token_accuracy": 0.6885220557451248,
"num_tokens": 3672249.0,
"step": 283
},
{
"epoch": 0.18176,
"grad_norm": 3.654318332672119,
"learning_rate": 5e-06,
"loss": 1.2322,
"mean_token_accuracy": 0.6707694306969643,
"num_tokens": 3685850.0,
"step": 284
},
{
"epoch": 0.1824,
"grad_norm": 3.4704298973083496,
"learning_rate": 5e-06,
"loss": 1.1906,
"mean_token_accuracy": 0.6730613932013512,
"num_tokens": 3699407.0,
"step": 285
},
{
"epoch": 0.18304,
"grad_norm": 4.028052806854248,
"learning_rate": 5e-06,
"loss": 1.511,
"mean_token_accuracy": 0.6216867938637733,
"num_tokens": 3711240.0,
"step": 286
},
{
"epoch": 0.18368,
"grad_norm": 3.9164350032806396,
"learning_rate": 5e-06,
"loss": 1.2674,
"mean_token_accuracy": 0.6724821552634239,
"num_tokens": 3723872.0,
"step": 287
},
{
"epoch": 0.18432,
"grad_norm": 4.470592498779297,
"learning_rate": 5e-06,
"loss": 1.3975,
"mean_token_accuracy": 0.6604571491479874,
"num_tokens": 3737013.0,
"step": 288
},
{
"epoch": 0.18496,
"grad_norm": 3.5540971755981445,
"learning_rate": 5e-06,
"loss": 1.1055,
"mean_token_accuracy": 0.683054082095623,
"num_tokens": 3750893.0,
"step": 289
},
{
"epoch": 0.1856,
"grad_norm": 3.6694583892822266,
"learning_rate": 5e-06,
"loss": 1.4439,
"mean_token_accuracy": 0.6296076104044914,
"num_tokens": 3763965.0,
"step": 290
},
{
"epoch": 0.18624,
"grad_norm": 4.5381059646606445,
"learning_rate": 5e-06,
"loss": 1.4017,
"mean_token_accuracy": 0.646364264190197,
"num_tokens": 3774483.0,
"step": 291
},
{
"epoch": 0.18688,
"grad_norm": 3.607478141784668,
"learning_rate": 5e-06,
"loss": 1.5724,
"mean_token_accuracy": 0.6356127932667732,
"num_tokens": 3791198.0,
"step": 292
},
{
"epoch": 0.18752,
"grad_norm": 3.7672901153564453,
"learning_rate": 5e-06,
"loss": 1.5793,
"mean_token_accuracy": 0.6113990694284439,
"num_tokens": 3805077.0,
"step": 293
},
{
"epoch": 0.18816,
"grad_norm": 3.517371892929077,
"learning_rate": 5e-06,
"loss": 1.3182,
"mean_token_accuracy": 0.6544737070798874,
"num_tokens": 3819471.0,
"step": 294
},
{
"epoch": 0.1888,
"grad_norm": 3.6588094234466553,
"learning_rate": 5e-06,
"loss": 1.1415,
"mean_token_accuracy": 0.6868576034903526,
"num_tokens": 3833299.0,
"step": 295
},
{
"epoch": 0.18944,
"grad_norm": 4.042988300323486,
"learning_rate": 5e-06,
"loss": 1.331,
"mean_token_accuracy": 0.6815094500780106,
"num_tokens": 3845749.0,
"step": 296
},
{
"epoch": 0.19008,
"grad_norm": 3.829592227935791,
"learning_rate": 5e-06,
"loss": 1.5645,
"mean_token_accuracy": 0.6153044253587723,
"num_tokens": 3858961.0,
"step": 297
},
{
"epoch": 0.19072,
"grad_norm": 4.074889659881592,
"learning_rate": 5e-06,
"loss": 1.4884,
"mean_token_accuracy": 0.6340715438127518,
"num_tokens": 3870935.0,
"step": 298
},
{
"epoch": 0.19136,
"grad_norm": 3.7292230129241943,
"learning_rate": 5e-06,
"loss": 1.461,
"mean_token_accuracy": 0.6340260431170464,
"num_tokens": 3883149.0,
"step": 299
},
{
"epoch": 0.192,
"grad_norm": 3.7191953659057617,
"learning_rate": 5e-06,
"loss": 1.2,
"mean_token_accuracy": 0.6816589832305908,
"num_tokens": 3896395.0,
"step": 300
},
{
"epoch": 0.19264,
"grad_norm": 3.5360212326049805,
"learning_rate": 5e-06,
"loss": 1.0832,
"mean_token_accuracy": 0.6812401190400124,
"num_tokens": 3912111.0,
"step": 301
},
{
"epoch": 0.19328,
"grad_norm": 4.3103132247924805,
"learning_rate": 5e-06,
"loss": 1.1345,
"mean_token_accuracy": 0.6864209771156311,
"num_tokens": 3922023.0,
"step": 302
},
{
"epoch": 0.19392,
"grad_norm": 4.525723457336426,
"learning_rate": 5e-06,
"loss": 1.1642,
"mean_token_accuracy": 0.6812352165579796,
"num_tokens": 3933051.0,
"step": 303
},
{
"epoch": 0.19456,
"grad_norm": 4.2806172370910645,
"learning_rate": 5e-06,
"loss": 1.4921,
"mean_token_accuracy": 0.6231048293411732,
"num_tokens": 3943203.0,
"step": 304
},
{
"epoch": 0.1952,
"grad_norm": 3.759788751602173,
"learning_rate": 5e-06,
"loss": 1.4501,
"mean_token_accuracy": 0.6561701893806458,
"num_tokens": 3956583.0,
"step": 305
},
{
"epoch": 0.19584,
"grad_norm": 3.7161481380462646,
"learning_rate": 5e-06,
"loss": 1.4199,
"mean_token_accuracy": 0.6432743892073631,
"num_tokens": 3968468.0,
"step": 306
},
{
"epoch": 0.19648,
"grad_norm": 3.6811437606811523,
"learning_rate": 5e-06,
"loss": 1.3131,
"mean_token_accuracy": 0.6686923652887344,
"num_tokens": 3980727.0,
"step": 307
},
{
"epoch": 0.19712,
"grad_norm": 4.159343242645264,
"learning_rate": 5e-06,
"loss": 1.4896,
"mean_token_accuracy": 0.6509639658033848,
"num_tokens": 3993831.0,
"step": 308
},
{
"epoch": 0.19776,
"grad_norm": 3.5082013607025146,
"learning_rate": 5e-06,
"loss": 1.1129,
"mean_token_accuracy": 0.7138783186674118,
"num_tokens": 4006704.0,
"step": 309
},
{
"epoch": 0.1984,
"grad_norm": 4.171331882476807,
"learning_rate": 5e-06,
"loss": 1.2373,
"mean_token_accuracy": 0.6580014526844025,
"num_tokens": 4023069.0,
"step": 310
},
{
"epoch": 0.19904,
"grad_norm": 3.516143321990967,
"learning_rate": 5e-06,
"loss": 1.4212,
"mean_token_accuracy": 0.6425874978303909,
"num_tokens": 4036994.0,
"step": 311
},
{
"epoch": 0.19968,
"grad_norm": 3.506361484527588,
"learning_rate": 5e-06,
"loss": 1.5113,
"mean_token_accuracy": 0.6125459745526314,
"num_tokens": 4050240.0,
"step": 312
},
{
"epoch": 0.20032,
"grad_norm": 4.198498725891113,
"learning_rate": 5e-06,
"loss": 1.4596,
"mean_token_accuracy": 0.6291738748550415,
"num_tokens": 4061778.0,
"step": 313
},
{
"epoch": 0.20096,
"grad_norm": 3.3201327323913574,
"learning_rate": 5e-06,
"loss": 1.4545,
"mean_token_accuracy": 0.6387949883937836,
"num_tokens": 4076918.0,
"step": 314
},
{
"epoch": 0.2016,
"grad_norm": 3.174764394760132,
"learning_rate": 5e-06,
"loss": 1.2661,
"mean_token_accuracy": 0.6788045838475227,
"num_tokens": 4091861.0,
"step": 315
},
{
"epoch": 0.20224,
"grad_norm": 3.773123264312744,
"learning_rate": 5e-06,
"loss": 1.0687,
"mean_token_accuracy": 0.7004147991538048,
"num_tokens": 4103615.0,
"step": 316
},
{
"epoch": 0.20288,
"grad_norm": 3.759938955307007,
"learning_rate": 5e-06,
"loss": 1.3967,
"mean_token_accuracy": 0.6160966157913208,
"num_tokens": 4116084.0,
"step": 317
},
{
"epoch": 0.20352,
"grad_norm": 3.3908169269561768,
"learning_rate": 5e-06,
"loss": 1.3196,
"mean_token_accuracy": 0.6666592955589294,
"num_tokens": 4130227.0,
"step": 318
},
{
"epoch": 0.20416,
"grad_norm": 3.709275007247925,
"learning_rate": 5e-06,
"loss": 1.1854,
"mean_token_accuracy": 0.690848097205162,
"num_tokens": 4144753.0,
"step": 319
},
{
"epoch": 0.2048,
"grad_norm": 4.040079116821289,
"learning_rate": 5e-06,
"loss": 1.271,
"mean_token_accuracy": 0.6566968783736229,
"num_tokens": 4157685.0,
"step": 320
},
{
"epoch": 0.20544,
"grad_norm": 3.6473450660705566,
"learning_rate": 5e-06,
"loss": 1.2116,
"mean_token_accuracy": 0.6666957810521126,
"num_tokens": 4171592.0,
"step": 321
},
{
"epoch": 0.20608,
"grad_norm": 4.44047212600708,
"learning_rate": 5e-06,
"loss": 1.3629,
"mean_token_accuracy": 0.6269867643713951,
"num_tokens": 4182621.0,
"step": 322
},
{
"epoch": 0.20672,
"grad_norm": 4.875802993774414,
"learning_rate": 5e-06,
"loss": 1.251,
"mean_token_accuracy": 0.671268492937088,
"num_tokens": 4191893.0,
"step": 323
},
{
"epoch": 0.20736,
"grad_norm": 3.2327218055725098,
"learning_rate": 5e-06,
"loss": 1.2432,
"mean_token_accuracy": 0.6710969433188438,
"num_tokens": 4207608.0,
"step": 324
},
{
"epoch": 0.208,
"grad_norm": 3.433987617492676,
"learning_rate": 5e-06,
"loss": 1.4811,
"mean_token_accuracy": 0.641696572303772,
"num_tokens": 4222070.0,
"step": 325
},
{
"epoch": 0.20864,
"grad_norm": 3.3024795055389404,
"learning_rate": 5e-06,
"loss": 1.4027,
"mean_token_accuracy": 0.6242343187332153,
"num_tokens": 4237064.0,
"step": 326
},
{
"epoch": 0.20928,
"grad_norm": 3.8479273319244385,
"learning_rate": 5e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.6728235110640526,
"num_tokens": 4251385.0,
"step": 327
},
{
"epoch": 0.20992,
"grad_norm": 3.911982774734497,
"learning_rate": 5e-06,
"loss": 1.3907,
"mean_token_accuracy": 0.633483037352562,
"num_tokens": 4264013.0,
"step": 328
},
{
"epoch": 0.21056,
"grad_norm": 3.055570125579834,
"learning_rate": 5e-06,
"loss": 1.5427,
"mean_token_accuracy": 0.6328605860471725,
"num_tokens": 4280497.0,
"step": 329
},
{
"epoch": 0.2112,
"grad_norm": 3.9111008644104004,
"learning_rate": 5e-06,
"loss": 1.3318,
"mean_token_accuracy": 0.6829836070537567,
"num_tokens": 4293657.0,
"step": 330
},
{
"epoch": 0.21184,
"grad_norm": 3.3383522033691406,
"learning_rate": 5e-06,
"loss": 1.3847,
"mean_token_accuracy": 0.6295205429196358,
"num_tokens": 4309618.0,
"step": 331
},
{
"epoch": 0.21248,
"grad_norm": 3.3280251026153564,
"learning_rate": 5e-06,
"loss": 1.5237,
"mean_token_accuracy": 0.6586425974965096,
"num_tokens": 4326147.0,
"step": 332
},
{
"epoch": 0.21312,
"grad_norm": 4.489631175994873,
"learning_rate": 5e-06,
"loss": 1.2546,
"mean_token_accuracy": 0.653937578201294,
"num_tokens": 4336964.0,
"step": 333
},
{
"epoch": 0.21376,
"grad_norm": 3.654022693634033,
"learning_rate": 5e-06,
"loss": 1.3246,
"mean_token_accuracy": 0.6411551535129547,
"num_tokens": 4349801.0,
"step": 334
},
{
"epoch": 0.2144,
"grad_norm": 3.9658567905426025,
"learning_rate": 5e-06,
"loss": 1.2223,
"mean_token_accuracy": 0.6976972743868828,
"num_tokens": 4362565.0,
"step": 335
},
{
"epoch": 0.21504,
"grad_norm": 4.284513473510742,
"learning_rate": 5e-06,
"loss": 1.2982,
"mean_token_accuracy": 0.6423984244465828,
"num_tokens": 4373113.0,
"step": 336
},
{
"epoch": 0.21568,
"grad_norm": 3.3546524047851562,
"learning_rate": 5e-06,
"loss": 1.5144,
"mean_token_accuracy": 0.6321973502635956,
"num_tokens": 4388315.0,
"step": 337
},
{
"epoch": 0.21632,
"grad_norm": 3.7386813163757324,
"learning_rate": 5e-06,
"loss": 1.2948,
"mean_token_accuracy": 0.6624687612056732,
"num_tokens": 4400428.0,
"step": 338
},
{
"epoch": 0.21696,
"grad_norm": 4.466668128967285,
"learning_rate": 5e-06,
"loss": 1.7564,
"mean_token_accuracy": 0.6115086637437344,
"num_tokens": 4412812.0,
"step": 339
},
{
"epoch": 0.2176,
"grad_norm": 3.6271438598632812,
"learning_rate": 5e-06,
"loss": 1.2008,
"mean_token_accuracy": 0.6809025183320045,
"num_tokens": 4427547.0,
"step": 340
},
{
"epoch": 0.21824,
"grad_norm": 4.270169258117676,
"learning_rate": 5e-06,
"loss": 1.4229,
"mean_token_accuracy": 0.6368228495121002,
"num_tokens": 4440979.0,
"step": 341
},
{
"epoch": 0.21888,
"grad_norm": 4.036962509155273,
"learning_rate": 5e-06,
"loss": 1.4317,
"mean_token_accuracy": 0.6311650201678276,
"num_tokens": 4452973.0,
"step": 342
},
{
"epoch": 0.21952,
"grad_norm": 3.645164728164673,
"learning_rate": 5e-06,
"loss": 1.2559,
"mean_token_accuracy": 0.6653162762522697,
"num_tokens": 4465907.0,
"step": 343
},
{
"epoch": 0.22016,
"grad_norm": 4.088701248168945,
"learning_rate": 5e-06,
"loss": 1.2515,
"mean_token_accuracy": 0.6554296687245369,
"num_tokens": 4477731.0,
"step": 344
},
{
"epoch": 0.2208,
"grad_norm": 3.935673713684082,
"learning_rate": 5e-06,
"loss": 1.23,
"mean_token_accuracy": 0.6872739866375923,
"num_tokens": 4490092.0,
"step": 345
},
{
"epoch": 0.22144,
"grad_norm": 3.8297736644744873,
"learning_rate": 5e-06,
"loss": 1.3338,
"mean_token_accuracy": 0.6665596142411232,
"num_tokens": 4502310.0,
"step": 346
},
{
"epoch": 0.22208,
"grad_norm": 3.4555552005767822,
"learning_rate": 5e-06,
"loss": 1.3386,
"mean_token_accuracy": 0.645504966378212,
"num_tokens": 4517152.0,
"step": 347
},
{
"epoch": 0.22272,
"grad_norm": 3.445380926132202,
"learning_rate": 5e-06,
"loss": 1.3176,
"mean_token_accuracy": 0.656374916434288,
"num_tokens": 4531588.0,
"step": 348
},
{
"epoch": 0.22336,
"grad_norm": 3.376492500305176,
"learning_rate": 5e-06,
"loss": 1.4416,
"mean_token_accuracy": 0.6500495374202728,
"num_tokens": 4548945.0,
"step": 349
},
{
"epoch": 0.224,
"grad_norm": 3.7682902812957764,
"learning_rate": 5e-06,
"loss": 1.1904,
"mean_token_accuracy": 0.7001358345150948,
"num_tokens": 4561085.0,
"step": 350
},
{
"epoch": 0.22464,
"grad_norm": 3.9040138721466064,
"learning_rate": 5e-06,
"loss": 1.3457,
"mean_token_accuracy": 0.6525379121303558,
"num_tokens": 4574945.0,
"step": 351
},
{
"epoch": 0.22528,
"grad_norm": 3.5685391426086426,
"learning_rate": 5e-06,
"loss": 1.3322,
"mean_token_accuracy": 0.6565421000123024,
"num_tokens": 4588253.0,
"step": 352
},
{
"epoch": 0.22592,
"grad_norm": 3.4802379608154297,
"learning_rate": 5e-06,
"loss": 1.2408,
"mean_token_accuracy": 0.6631387919187546,
"num_tokens": 4603347.0,
"step": 353
},
{
"epoch": 0.22656,
"grad_norm": 4.1048126220703125,
"learning_rate": 5e-06,
"loss": 1.2342,
"mean_token_accuracy": 0.7041416242718697,
"num_tokens": 4616180.0,
"step": 354
},
{
"epoch": 0.2272,
"grad_norm": 3.617142677307129,
"learning_rate": 5e-06,
"loss": 1.193,
"mean_token_accuracy": 0.68916055560112,
"num_tokens": 4628116.0,
"step": 355
},
{
"epoch": 0.22784,
"grad_norm": 3.48990797996521,
"learning_rate": 5e-06,
"loss": 1.3371,
"mean_token_accuracy": 0.6546562537550926,
"num_tokens": 4644302.0,
"step": 356
},
{
"epoch": 0.22848,
"grad_norm": 4.8016180992126465,
"learning_rate": 5e-06,
"loss": 1.3828,
"mean_token_accuracy": 0.6490079835057259,
"num_tokens": 4654201.0,
"step": 357
},
{
"epoch": 0.22912,
"grad_norm": 3.589632749557495,
"learning_rate": 5e-06,
"loss": 1.303,
"mean_token_accuracy": 0.6622688621282578,
"num_tokens": 4666579.0,
"step": 358
},
{
"epoch": 0.22976,
"grad_norm": 3.8532536029815674,
"learning_rate": 5e-06,
"loss": 1.2905,
"mean_token_accuracy": 0.6697241440415382,
"num_tokens": 4678614.0,
"step": 359
},
{
"epoch": 0.2304,
"grad_norm": 3.768440008163452,
"learning_rate": 5e-06,
"loss": 1.3384,
"mean_token_accuracy": 0.6274634152650833,
"num_tokens": 4690259.0,
"step": 360
},
{
"epoch": 0.23104,
"grad_norm": 4.048650741577148,
"learning_rate": 5e-06,
"loss": 1.27,
"mean_token_accuracy": 0.6553780138492584,
"num_tokens": 4702794.0,
"step": 361
},
{
"epoch": 0.23168,
"grad_norm": 3.264341354370117,
"learning_rate": 5e-06,
"loss": 1.2521,
"mean_token_accuracy": 0.7201630547642708,
"num_tokens": 4718863.0,
"step": 362
},
{
"epoch": 0.23232,
"grad_norm": 3.293111562728882,
"learning_rate": 5e-06,
"loss": 1.206,
"mean_token_accuracy": 0.691804438829422,
"num_tokens": 4731459.0,
"step": 363
},
{
"epoch": 0.23296,
"grad_norm": 3.562152862548828,
"learning_rate": 5e-06,
"loss": 1.5825,
"mean_token_accuracy": 0.6220528446137905,
"num_tokens": 4744183.0,
"step": 364
},
{
"epoch": 0.2336,
"grad_norm": 3.858302116394043,
"learning_rate": 5e-06,
"loss": 1.2556,
"mean_token_accuracy": 0.6687511652708054,
"num_tokens": 4755339.0,
"step": 365
},
{
"epoch": 0.23424,
"grad_norm": 3.6017565727233887,
"learning_rate": 5e-06,
"loss": 1.4059,
"mean_token_accuracy": 0.6345420032739639,
"num_tokens": 4767629.0,
"step": 366
},
{
"epoch": 0.23488,
"grad_norm": 3.706761598587036,
"learning_rate": 5e-06,
"loss": 1.1984,
"mean_token_accuracy": 0.6690258160233498,
"num_tokens": 4778905.0,
"step": 367
},
{
"epoch": 0.23552,
"grad_norm": 3.1312525272369385,
"learning_rate": 5e-06,
"loss": 1.3188,
"mean_token_accuracy": 0.6492372825741768,
"num_tokens": 4794948.0,
"step": 368
},
{
"epoch": 0.23616,
"grad_norm": 4.282083034515381,
"learning_rate": 5e-06,
"loss": 1.4944,
"mean_token_accuracy": 0.6254525110125542,
"num_tokens": 4807887.0,
"step": 369
},
{
"epoch": 0.2368,
"grad_norm": 3.156104564666748,
"learning_rate": 5e-06,
"loss": 1.4343,
"mean_token_accuracy": 0.6384943351149559,
"num_tokens": 4823135.0,
"step": 370
},
{
"epoch": 0.23744,
"grad_norm": 3.9901719093322754,
"learning_rate": 5e-06,
"loss": 1.2035,
"mean_token_accuracy": 0.7160904258489609,
"num_tokens": 4835841.0,
"step": 371
},
{
"epoch": 0.23808,
"grad_norm": 3.2367820739746094,
"learning_rate": 5e-06,
"loss": 1.276,
"mean_token_accuracy": 0.659798189997673,
"num_tokens": 4852490.0,
"step": 372
},
{
"epoch": 0.23872,
"grad_norm": 3.741534948348999,
"learning_rate": 5e-06,
"loss": 1.3245,
"mean_token_accuracy": 0.6480759754776955,
"num_tokens": 4864536.0,
"step": 373
},
{
"epoch": 0.23936,
"grad_norm": 5.090270042419434,
"learning_rate": 5e-06,
"loss": 1.3845,
"mean_token_accuracy": 0.6432400941848755,
"num_tokens": 4873861.0,
"step": 374
},
{
"epoch": 0.24,
"grad_norm": 3.550171136856079,
"learning_rate": 5e-06,
"loss": 1.305,
"mean_token_accuracy": 0.6453453898429871,
"num_tokens": 4888154.0,
"step": 375
},
{
"epoch": 0.24064,
"grad_norm": 4.662119388580322,
"learning_rate": 5e-06,
"loss": 1.3197,
"mean_token_accuracy": 0.6590218544006348,
"num_tokens": 4898468.0,
"step": 376
},
{
"epoch": 0.24128,
"grad_norm": 5.356217861175537,
"learning_rate": 5e-06,
"loss": 1.3352,
"mean_token_accuracy": 0.6680933758616447,
"num_tokens": 4910094.0,
"step": 377
},
{
"epoch": 0.24192,
"grad_norm": 3.6107497215270996,
"learning_rate": 5e-06,
"loss": 1.1964,
"mean_token_accuracy": 0.6513196639716625,
"num_tokens": 4924004.0,
"step": 378
},
{
"epoch": 0.24256,
"grad_norm": 3.8560822010040283,
"learning_rate": 5e-06,
"loss": 1.4503,
"mean_token_accuracy": 0.6257938891649246,
"num_tokens": 4937000.0,
"step": 379
},
{
"epoch": 0.2432,
"grad_norm": 3.5278120040893555,
"learning_rate": 5e-06,
"loss": 1.5268,
"mean_token_accuracy": 0.6288462430238724,
"num_tokens": 4951330.0,
"step": 380
},
{
"epoch": 0.24384,
"grad_norm": 3.4525208473205566,
"learning_rate": 5e-06,
"loss": 1.2486,
"mean_token_accuracy": 0.6658232286572456,
"num_tokens": 4966074.0,
"step": 381
},
{
"epoch": 0.24448,
"grad_norm": 3.9059042930603027,
"learning_rate": 5e-06,
"loss": 1.3658,
"mean_token_accuracy": 0.6580025032162666,
"num_tokens": 4977899.0,
"step": 382
},
{
"epoch": 0.24512,
"grad_norm": 3.895254135131836,
"learning_rate": 5e-06,
"loss": 1.2906,
"mean_token_accuracy": 0.659791849553585,
"num_tokens": 4990481.0,
"step": 383
},
{
"epoch": 0.24576,
"grad_norm": 3.6709907054901123,
"learning_rate": 5e-06,
"loss": 1.243,
"mean_token_accuracy": 0.6784983575344086,
"num_tokens": 5003987.0,
"step": 384
},
{
"epoch": 0.2464,
"grad_norm": 3.8411707878112793,
"learning_rate": 5e-06,
"loss": 1.3464,
"mean_token_accuracy": 0.6375136002898216,
"num_tokens": 5015850.0,
"step": 385
},
{
"epoch": 0.24704,
"grad_norm": 4.552581787109375,
"learning_rate": 5e-06,
"loss": 1.5245,
"mean_token_accuracy": 0.6128373965620995,
"num_tokens": 5026044.0,
"step": 386
},
{
"epoch": 0.24768,
"grad_norm": 3.8649439811706543,
"learning_rate": 5e-06,
"loss": 1.4339,
"mean_token_accuracy": 0.6444417163729668,
"num_tokens": 5039126.0,
"step": 387
},
{
"epoch": 0.24832,
"grad_norm": 4.057676315307617,
"learning_rate": 5e-06,
"loss": 1.352,
"mean_token_accuracy": 0.6415472850203514,
"num_tokens": 5051247.0,
"step": 388
},
{
"epoch": 0.24896,
"grad_norm": 4.093824863433838,
"learning_rate": 5e-06,
"loss": 1.3294,
"mean_token_accuracy": 0.6551511734724045,
"num_tokens": 5066290.0,
"step": 389
},
{
"epoch": 0.2496,
"grad_norm": 3.478832244873047,
"learning_rate": 5e-06,
"loss": 1.2387,
"mean_token_accuracy": 0.6599762067198753,
"num_tokens": 5079904.0,
"step": 390
},
{
"epoch": 0.25024,
"grad_norm": 3.4885847568511963,
"learning_rate": 5e-06,
"loss": 1.2518,
"mean_token_accuracy": 0.6731147542595863,
"num_tokens": 5093663.0,
"step": 391
},
{
"epoch": 0.25088,
"grad_norm": 3.4742021560668945,
"learning_rate": 5e-06,
"loss": 1.1443,
"mean_token_accuracy": 0.7067101299762726,
"num_tokens": 5108375.0,
"step": 392
},
{
"epoch": 0.25152,
"grad_norm": 3.459711790084839,
"learning_rate": 5e-06,
"loss": 1.2421,
"mean_token_accuracy": 0.677531287074089,
"num_tokens": 5121820.0,
"step": 393
},
{
"epoch": 0.25216,
"grad_norm": 3.607994794845581,
"learning_rate": 5e-06,
"loss": 1.7177,
"mean_token_accuracy": 0.5983672738075256,
"num_tokens": 5136753.0,
"step": 394
},
{
"epoch": 0.2528,
"grad_norm": 3.9843177795410156,
"learning_rate": 5e-06,
"loss": 1.2271,
"mean_token_accuracy": 0.6913007572293282,
"num_tokens": 5148997.0,
"step": 395
},
{
"epoch": 0.25344,
"grad_norm": 3.323129177093506,
"learning_rate": 5e-06,
"loss": 1.4278,
"mean_token_accuracy": 0.6322130486369133,
"num_tokens": 5163285.0,
"step": 396
},
{
"epoch": 0.25408,
"grad_norm": 4.542083740234375,
"learning_rate": 5e-06,
"loss": 1.3214,
"mean_token_accuracy": 0.676998108625412,
"num_tokens": 5174430.0,
"step": 397
},
{
"epoch": 0.25472,
"grad_norm": 3.523313045501709,
"learning_rate": 5e-06,
"loss": 1.5198,
"mean_token_accuracy": 0.6127360239624977,
"num_tokens": 5188411.0,
"step": 398
},
{
"epoch": 0.25536,
"grad_norm": 3.990492820739746,
"learning_rate": 5e-06,
"loss": 1.4177,
"mean_token_accuracy": 0.6671818047761917,
"num_tokens": 5199067.0,
"step": 399
},
{
"epoch": 0.256,
"grad_norm": 3.5755157470703125,
"learning_rate": 5e-06,
"loss": 1.3593,
"mean_token_accuracy": 0.6605222076177597,
"num_tokens": 5212285.0,
"step": 400
},
{
"epoch": 0.25664,
"grad_norm": 3.8733558654785156,
"learning_rate": 5e-06,
"loss": 1.2535,
"mean_token_accuracy": 0.6705236658453941,
"num_tokens": 5224693.0,
"step": 401
},
{
"epoch": 0.25728,
"grad_norm": 3.86195707321167,
"learning_rate": 5e-06,
"loss": 1.0007,
"mean_token_accuracy": 0.7054353207349777,
"num_tokens": 5235503.0,
"step": 402
},
{
"epoch": 0.25792,
"grad_norm": 4.819467067718506,
"learning_rate": 5e-06,
"loss": 1.2544,
"mean_token_accuracy": 0.6714291796088219,
"num_tokens": 5244676.0,
"step": 403
},
{
"epoch": 0.25856,
"grad_norm": 4.117583274841309,
"learning_rate": 5e-06,
"loss": 1.4205,
"mean_token_accuracy": 0.6371640935540199,
"num_tokens": 5259342.0,
"step": 404
},
{
"epoch": 0.2592,
"grad_norm": 3.8214738368988037,
"learning_rate": 5e-06,
"loss": 1.2089,
"mean_token_accuracy": 0.682219110429287,
"num_tokens": 5271812.0,
"step": 405
},
{
"epoch": 0.25984,
"grad_norm": 4.264610290527344,
"learning_rate": 5e-06,
"loss": 1.2525,
"mean_token_accuracy": 0.6648172214627266,
"num_tokens": 5285329.0,
"step": 406
},
{
"epoch": 0.26048,
"grad_norm": 3.759557008743286,
"learning_rate": 5e-06,
"loss": 1.2922,
"mean_token_accuracy": 0.6575791016221046,
"num_tokens": 5298290.0,
"step": 407
},
{
"epoch": 0.26112,
"grad_norm": 5.103738784790039,
"learning_rate": 5e-06,
"loss": 1.3045,
"mean_token_accuracy": 0.6531935781240463,
"num_tokens": 5313458.0,
"step": 408
},
{
"epoch": 0.26176,
"grad_norm": 4.379658222198486,
"learning_rate": 5e-06,
"loss": 1.2592,
"mean_token_accuracy": 0.6718562245368958,
"num_tokens": 5324820.0,
"step": 409
},
{
"epoch": 0.2624,
"grad_norm": 3.613741636276245,
"learning_rate": 5e-06,
"loss": 1.3052,
"mean_token_accuracy": 0.6661521196365356,
"num_tokens": 5340445.0,
"step": 410
},
{
"epoch": 0.26304,
"grad_norm": 3.643263578414917,
"learning_rate": 5e-06,
"loss": 1.4656,
"mean_token_accuracy": 0.6501626446843147,
"num_tokens": 5353074.0,
"step": 411
},
{
"epoch": 0.26368,
"grad_norm": 3.359731912612915,
"learning_rate": 5e-06,
"loss": 1.1761,
"mean_token_accuracy": 0.6895303055644035,
"num_tokens": 5367294.0,
"step": 412
},
{
"epoch": 0.26432,
"grad_norm": 4.145616054534912,
"learning_rate": 5e-06,
"loss": 1.3095,
"mean_token_accuracy": 0.6614864692091942,
"num_tokens": 5378260.0,
"step": 413
},
{
"epoch": 0.26496,
"grad_norm": 4.191911697387695,
"learning_rate": 5e-06,
"loss": 1.4995,
"mean_token_accuracy": 0.6586913987994194,
"num_tokens": 5390393.0,
"step": 414
},
{
"epoch": 0.2656,
"grad_norm": 3.9197440147399902,
"learning_rate": 5e-06,
"loss": 1.2837,
"mean_token_accuracy": 0.649936854839325,
"num_tokens": 5404355.0,
"step": 415
},
{
"epoch": 0.26624,
"grad_norm": 3.791869640350342,
"learning_rate": 5e-06,
"loss": 1.3852,
"mean_token_accuracy": 0.6447301283478737,
"num_tokens": 5418228.0,
"step": 416
},
{
"epoch": 0.26688,
"grad_norm": 3.4961142539978027,
"learning_rate": 5e-06,
"loss": 1.2148,
"mean_token_accuracy": 0.6757354438304901,
"num_tokens": 5431981.0,
"step": 417
},
{
"epoch": 0.26752,
"grad_norm": 3.777859687805176,
"learning_rate": 5e-06,
"loss": 1.2461,
"mean_token_accuracy": 0.6744889244437218,
"num_tokens": 5446699.0,
"step": 418
},
{
"epoch": 0.26816,
"grad_norm": 4.008702754974365,
"learning_rate": 5e-06,
"loss": 1.4865,
"mean_token_accuracy": 0.628353901207447,
"num_tokens": 5459373.0,
"step": 419
},
{
"epoch": 0.2688,
"grad_norm": 3.69231915473938,
"learning_rate": 5e-06,
"loss": 1.418,
"mean_token_accuracy": 0.6555268168449402,
"num_tokens": 5473223.0,
"step": 420
},
{
"epoch": 0.26944,
"grad_norm": 3.597212314605713,
"learning_rate": 5e-06,
"loss": 1.4661,
"mean_token_accuracy": 0.6289801895618439,
"num_tokens": 5487849.0,
"step": 421
},
{
"epoch": 0.27008,
"grad_norm": 3.8283562660217285,
"learning_rate": 5e-06,
"loss": 1.2745,
"mean_token_accuracy": 0.6649068146944046,
"num_tokens": 5499444.0,
"step": 422
},
{
"epoch": 0.27072,
"grad_norm": 3.896993398666382,
"learning_rate": 5e-06,
"loss": 1.3163,
"mean_token_accuracy": 0.6707305237650871,
"num_tokens": 5512030.0,
"step": 423
},
{
"epoch": 0.27136,
"grad_norm": 3.609224557876587,
"learning_rate": 5e-06,
"loss": 1.2482,
"mean_token_accuracy": 0.6678152307868004,
"num_tokens": 5525475.0,
"step": 424
},
{
"epoch": 0.272,
"grad_norm": 3.715836763381958,
"learning_rate": 5e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.6893536150455475,
"num_tokens": 5536706.0,
"step": 425
},
{
"epoch": 0.27264,
"grad_norm": 4.006832599639893,
"learning_rate": 5e-06,
"loss": 1.3409,
"mean_token_accuracy": 0.6921984776854515,
"num_tokens": 5551024.0,
"step": 426
},
{
"epoch": 0.27328,
"grad_norm": 3.5625905990600586,
"learning_rate": 5e-06,
"loss": 1.3058,
"mean_token_accuracy": 0.6508674696087837,
"num_tokens": 5566008.0,
"step": 427
},
{
"epoch": 0.27392,
"grad_norm": 3.7165002822875977,
"learning_rate": 5e-06,
"loss": 1.4402,
"mean_token_accuracy": 0.6360224187374115,
"num_tokens": 5579661.0,
"step": 428
},
{
"epoch": 0.27456,
"grad_norm": 3.702185869216919,
"learning_rate": 5e-06,
"loss": 1.3791,
"mean_token_accuracy": 0.6388497278094292,
"num_tokens": 5593091.0,
"step": 429
},
{
"epoch": 0.2752,
"grad_norm": 3.397646188735962,
"learning_rate": 5e-06,
"loss": 1.4501,
"mean_token_accuracy": 0.6443121284246445,
"num_tokens": 5607808.0,
"step": 430
},
{
"epoch": 0.27584,
"grad_norm": 4.425196170806885,
"learning_rate": 5e-06,
"loss": 1.2816,
"mean_token_accuracy": 0.646281823515892,
"num_tokens": 5619010.0,
"step": 431
},
{
"epoch": 0.27648,
"grad_norm": 3.7968697547912598,
"learning_rate": 5e-06,
"loss": 1.4615,
"mean_token_accuracy": 0.6492092609405518,
"num_tokens": 5634182.0,
"step": 432
},
{
"epoch": 0.27712,
"grad_norm": 3.3441648483276367,
"learning_rate": 5e-06,
"loss": 1.4186,
"mean_token_accuracy": 0.6269052773714066,
"num_tokens": 5647759.0,
"step": 433
},
{
"epoch": 0.27776,
"grad_norm": 3.4352946281433105,
"learning_rate": 5e-06,
"loss": 1.3009,
"mean_token_accuracy": 0.666948527097702,
"num_tokens": 5662089.0,
"step": 434
},
{
"epoch": 0.2784,
"grad_norm": 3.8102269172668457,
"learning_rate": 5e-06,
"loss": 1.3815,
"mean_token_accuracy": 0.6399514004588127,
"num_tokens": 5674982.0,
"step": 435
},
{
"epoch": 0.27904,
"grad_norm": 3.747889995574951,
"learning_rate": 5e-06,
"loss": 1.5238,
"mean_token_accuracy": 0.6195821687579155,
"num_tokens": 5687944.0,
"step": 436
},
{
"epoch": 0.27968,
"grad_norm": 3.963461399078369,
"learning_rate": 5e-06,
"loss": 1.2936,
"mean_token_accuracy": 0.6649496257305145,
"num_tokens": 5699002.0,
"step": 437
},
{
"epoch": 0.28032,
"grad_norm": 3.3493547439575195,
"learning_rate": 5e-06,
"loss": 1.346,
"mean_token_accuracy": 0.6441225036978722,
"num_tokens": 5712275.0,
"step": 438
},
{
"epoch": 0.28096,
"grad_norm": 3.779747247695923,
"learning_rate": 5e-06,
"loss": 1.5187,
"mean_token_accuracy": 0.6183040626347065,
"num_tokens": 5726089.0,
"step": 439
},
{
"epoch": 0.2816,
"grad_norm": 3.8327977657318115,
"learning_rate": 5e-06,
"loss": 1.301,
"mean_token_accuracy": 0.6831925585865974,
"num_tokens": 5737061.0,
"step": 440
},
{
"epoch": 0.28224,
"grad_norm": 3.559340476989746,
"learning_rate": 5e-06,
"loss": 1.286,
"mean_token_accuracy": 0.667030468583107,
"num_tokens": 5751187.0,
"step": 441
},
{
"epoch": 0.28288,
"grad_norm": 3.395509719848633,
"learning_rate": 5e-06,
"loss": 1.1954,
"mean_token_accuracy": 0.6770320907235146,
"num_tokens": 5765243.0,
"step": 442
},
{
"epoch": 0.28352,
"grad_norm": 4.320680618286133,
"learning_rate": 5e-06,
"loss": 1.4825,
"mean_token_accuracy": 0.6353371068835258,
"num_tokens": 5776809.0,
"step": 443
},
{
"epoch": 0.28416,
"grad_norm": 4.229187488555908,
"learning_rate": 5e-06,
"loss": 1.3178,
"mean_token_accuracy": 0.6752159968018532,
"num_tokens": 5788234.0,
"step": 444
},
{
"epoch": 0.2848,
"grad_norm": 3.9184088706970215,
"learning_rate": 5e-06,
"loss": 1.2316,
"mean_token_accuracy": 0.6824081540107727,
"num_tokens": 5799793.0,
"step": 445
},
{
"epoch": 0.28544,
"grad_norm": 4.083866596221924,
"learning_rate": 5e-06,
"loss": 1.4558,
"mean_token_accuracy": 0.6533151641488075,
"num_tokens": 5812228.0,
"step": 446
},
{
"epoch": 0.28608,
"grad_norm": 4.136886119842529,
"learning_rate": 5e-06,
"loss": 1.2802,
"mean_token_accuracy": 0.6800569593906403,
"num_tokens": 5822937.0,
"step": 447
},
{
"epoch": 0.28672,
"grad_norm": 3.92091965675354,
"learning_rate": 5e-06,
"loss": 1.253,
"mean_token_accuracy": 0.6916609779000282,
"num_tokens": 5835964.0,
"step": 448
},
{
"epoch": 0.28736,
"grad_norm": 3.784158706665039,
"learning_rate": 5e-06,
"loss": 1.2972,
"mean_token_accuracy": 0.674082837998867,
"num_tokens": 5847766.0,
"step": 449
},
{
"epoch": 0.288,
"grad_norm": 4.055779933929443,
"learning_rate": 5e-06,
"loss": 1.5458,
"mean_token_accuracy": 0.6409785822033882,
"num_tokens": 5860297.0,
"step": 450
},
{
"epoch": 0.28864,
"grad_norm": 4.014561176300049,
"learning_rate": 5e-06,
"loss": 1.239,
"mean_token_accuracy": 0.6951504573225975,
"num_tokens": 5871881.0,
"step": 451
},
{
"epoch": 0.28928,
"grad_norm": 3.908066987991333,
"learning_rate": 5e-06,
"loss": 1.1405,
"mean_token_accuracy": 0.6899219900369644,
"num_tokens": 5883233.0,
"step": 452
},
{
"epoch": 0.28992,
"grad_norm": 3.5451455116271973,
"learning_rate": 5e-06,
"loss": 1.4424,
"mean_token_accuracy": 0.6282008588314056,
"num_tokens": 5897342.0,
"step": 453
},
{
"epoch": 0.29056,
"grad_norm": 3.9957897663116455,
"learning_rate": 5e-06,
"loss": 1.2495,
"mean_token_accuracy": 0.6782330796122551,
"num_tokens": 5909429.0,
"step": 454
},
{
"epoch": 0.2912,
"grad_norm": 3.5935301780700684,
"learning_rate": 5e-06,
"loss": 1.292,
"mean_token_accuracy": 0.6655653864145279,
"num_tokens": 5923163.0,
"step": 455
},
{
"epoch": 0.29184,
"grad_norm": 3.677741765975952,
"learning_rate": 5e-06,
"loss": 1.3873,
"mean_token_accuracy": 0.6691123694181442,
"num_tokens": 5938353.0,
"step": 456
},
{
"epoch": 0.29248,
"grad_norm": 3.7560808658599854,
"learning_rate": 5e-06,
"loss": 1.2574,
"mean_token_accuracy": 0.6840595826506615,
"num_tokens": 5950566.0,
"step": 457
},
{
"epoch": 0.29312,
"grad_norm": 4.219088077545166,
"learning_rate": 5e-06,
"loss": 1.4552,
"mean_token_accuracy": 0.6695370376110077,
"num_tokens": 5963475.0,
"step": 458
},
{
"epoch": 0.29376,
"grad_norm": 4.02653169631958,
"learning_rate": 5e-06,
"loss": 1.3478,
"mean_token_accuracy": 0.6490977182984352,
"num_tokens": 5974934.0,
"step": 459
},
{
"epoch": 0.2944,
"grad_norm": 3.8300678730010986,
"learning_rate": 5e-06,
"loss": 1.2826,
"mean_token_accuracy": 0.64987413585186,
"num_tokens": 5987250.0,
"step": 460
},
{
"epoch": 0.29504,
"grad_norm": 3.818307876586914,
"learning_rate": 5e-06,
"loss": 1.1296,
"mean_token_accuracy": 0.6943321749567986,
"num_tokens": 5999834.0,
"step": 461
},
{
"epoch": 0.29568,
"grad_norm": 3.6047048568725586,
"learning_rate": 5e-06,
"loss": 1.168,
"mean_token_accuracy": 0.6777333468198776,
"num_tokens": 6012454.0,
"step": 462
},
{
"epoch": 0.29632,
"grad_norm": 3.4579696655273438,
"learning_rate": 5e-06,
"loss": 1.4147,
"mean_token_accuracy": 0.6355468481779099,
"num_tokens": 6026563.0,
"step": 463
},
{
"epoch": 0.29696,
"grad_norm": 4.736328125,
"learning_rate": 5e-06,
"loss": 1.3852,
"mean_token_accuracy": 0.6531487628817558,
"num_tokens": 6037687.0,
"step": 464
},
{
"epoch": 0.2976,
"grad_norm": 5.537712574005127,
"learning_rate": 5e-06,
"loss": 1.3899,
"mean_token_accuracy": 0.6446737200021744,
"num_tokens": 6052621.0,
"step": 465
},
{
"epoch": 0.29824,
"grad_norm": 4.118095397949219,
"learning_rate": 5e-06,
"loss": 1.4534,
"mean_token_accuracy": 0.6429826766252518,
"num_tokens": 6065725.0,
"step": 466
},
{
"epoch": 0.29888,
"grad_norm": 3.415851354598999,
"learning_rate": 5e-06,
"loss": 1.1162,
"mean_token_accuracy": 0.6864155679941177,
"num_tokens": 6080168.0,
"step": 467
},
{
"epoch": 0.29952,
"grad_norm": 3.098151922225952,
"learning_rate": 5e-06,
"loss": 1.3817,
"mean_token_accuracy": 0.6485566720366478,
"num_tokens": 6096552.0,
"step": 468
},
{
"epoch": 0.30016,
"grad_norm": 4.419194221496582,
"learning_rate": 5e-06,
"loss": 1.2934,
"mean_token_accuracy": 0.6655605882406235,
"num_tokens": 6107311.0,
"step": 469
},
{
"epoch": 0.3008,
"grad_norm": 2.9706687927246094,
"learning_rate": 5e-06,
"loss": 1.4043,
"mean_token_accuracy": 0.6342752501368523,
"num_tokens": 6123233.0,
"step": 470
},
{
"epoch": 0.30144,
"grad_norm": 4.0415940284729,
"learning_rate": 5e-06,
"loss": 1.4607,
"mean_token_accuracy": 0.6415122263133526,
"num_tokens": 6133347.0,
"step": 471
},
{
"epoch": 0.30208,
"grad_norm": 3.6789848804473877,
"learning_rate": 5e-06,
"loss": 1.359,
"mean_token_accuracy": 0.6488511562347412,
"num_tokens": 6147619.0,
"step": 472
},
{
"epoch": 0.30272,
"grad_norm": 3.8090357780456543,
"learning_rate": 5e-06,
"loss": 1.3368,
"mean_token_accuracy": 0.638521321117878,
"num_tokens": 6159564.0,
"step": 473
},
{
"epoch": 0.30336,
"grad_norm": 3.4183847904205322,
"learning_rate": 5e-06,
"loss": 1.428,
"mean_token_accuracy": 0.6213861741125584,
"num_tokens": 6173025.0,
"step": 474
},
{
"epoch": 0.304,
"grad_norm": 3.822892427444458,
"learning_rate": 5e-06,
"loss": 1.0865,
"mean_token_accuracy": 0.6979233846068382,
"num_tokens": 6185966.0,
"step": 475
},
{
"epoch": 0.30464,
"grad_norm": 3.686979293823242,
"learning_rate": 5e-06,
"loss": 1.2195,
"mean_token_accuracy": 0.6794964447617531,
"num_tokens": 6198680.0,
"step": 476
},
{
"epoch": 0.30528,
"grad_norm": 3.797368049621582,
"learning_rate": 5e-06,
"loss": 1.3273,
"mean_token_accuracy": 0.6465971991419792,
"num_tokens": 6212513.0,
"step": 477
},
{
"epoch": 0.30592,
"grad_norm": 3.9698474407196045,
"learning_rate": 5e-06,
"loss": 1.3636,
"mean_token_accuracy": 0.6398535817861557,
"num_tokens": 6224341.0,
"step": 478
},
{
"epoch": 0.30656,
"grad_norm": 3.755352258682251,
"learning_rate": 5e-06,
"loss": 1.2421,
"mean_token_accuracy": 0.6844679713249207,
"num_tokens": 6236749.0,
"step": 479
},
{
"epoch": 0.3072,
"grad_norm": 3.6229302883148193,
"learning_rate": 5e-06,
"loss": 1.1653,
"mean_token_accuracy": 0.7136622071266174,
"num_tokens": 6250516.0,
"step": 480
},
{
"epoch": 0.30784,
"grad_norm": 4.006715774536133,
"learning_rate": 5e-06,
"loss": 1.0948,
"mean_token_accuracy": 0.6872854009270668,
"num_tokens": 6264775.0,
"step": 481
},
{
"epoch": 0.30848,
"grad_norm": 3.036703586578369,
"learning_rate": 5e-06,
"loss": 1.4079,
"mean_token_accuracy": 0.6424620673060417,
"num_tokens": 6280727.0,
"step": 482
},
{
"epoch": 0.30912,
"grad_norm": 3.861215114593506,
"learning_rate": 5e-06,
"loss": 1.5746,
"mean_token_accuracy": 0.6214606538414955,
"num_tokens": 6294874.0,
"step": 483
},
{
"epoch": 0.30976,
"grad_norm": 3.6067492961883545,
"learning_rate": 5e-06,
"loss": 1.2666,
"mean_token_accuracy": 0.6533055976033211,
"num_tokens": 6308075.0,
"step": 484
},
{
"epoch": 0.3104,
"grad_norm": 3.8777058124542236,
"learning_rate": 5e-06,
"loss": 1.4112,
"mean_token_accuracy": 0.6393994837999344,
"num_tokens": 6319998.0,
"step": 485
},
{
"epoch": 0.31104,
"grad_norm": 3.640782594680786,
"learning_rate": 5e-06,
"loss": 1.3312,
"mean_token_accuracy": 0.6485870778560638,
"num_tokens": 6332589.0,
"step": 486
},
{
"epoch": 0.31168,
"grad_norm": 3.792318344116211,
"learning_rate": 5e-06,
"loss": 1.4024,
"mean_token_accuracy": 0.6588046550750732,
"num_tokens": 6345037.0,
"step": 487
},
{
"epoch": 0.31232,
"grad_norm": 3.5393240451812744,
"learning_rate": 5e-06,
"loss": 1.3419,
"mean_token_accuracy": 0.6457289680838585,
"num_tokens": 6359457.0,
"step": 488
},
{
"epoch": 0.31296,
"grad_norm": 3.974876642227173,
"learning_rate": 5e-06,
"loss": 1.1741,
"mean_token_accuracy": 0.6610330641269684,
"num_tokens": 6370983.0,
"step": 489
},
{
"epoch": 0.3136,
"grad_norm": 3.6941604614257812,
"learning_rate": 5e-06,
"loss": 1.2241,
"mean_token_accuracy": 0.686374232172966,
"num_tokens": 6384168.0,
"step": 490
},
{
"epoch": 0.31424,
"grad_norm": 4.212184906005859,
"learning_rate": 5e-06,
"loss": 1.4216,
"mean_token_accuracy": 0.6811521798372269,
"num_tokens": 6395881.0,
"step": 491
},
{
"epoch": 0.31488,
"grad_norm": 3.925226926803589,
"learning_rate": 5e-06,
"loss": 1.3582,
"mean_token_accuracy": 0.6342300400137901,
"num_tokens": 6409919.0,
"step": 492
},
{
"epoch": 0.31552,
"grad_norm": 3.9599673748016357,
"learning_rate": 5e-06,
"loss": 1.3918,
"mean_token_accuracy": 0.6489474773406982,
"num_tokens": 6422097.0,
"step": 493
},
{
"epoch": 0.31616,
"grad_norm": 3.42258358001709,
"learning_rate": 5e-06,
"loss": 1.1146,
"mean_token_accuracy": 0.6913427859544754,
"num_tokens": 6435173.0,
"step": 494
},
{
"epoch": 0.3168,
"grad_norm": 4.284220218658447,
"learning_rate": 5e-06,
"loss": 1.4547,
"mean_token_accuracy": 0.6440516263246536,
"num_tokens": 6447321.0,
"step": 495
},
{
"epoch": 0.31744,
"grad_norm": 3.7218246459960938,
"learning_rate": 5e-06,
"loss": 1.3806,
"mean_token_accuracy": 0.629929706454277,
"num_tokens": 6460270.0,
"step": 496
},
{
"epoch": 0.31808,
"grad_norm": 3.406933546066284,
"learning_rate": 5e-06,
"loss": 1.1401,
"mean_token_accuracy": 0.67889504134655,
"num_tokens": 6473352.0,
"step": 497
},
{
"epoch": 0.31872,
"grad_norm": 3.2584404945373535,
"learning_rate": 5e-06,
"loss": 1.2213,
"mean_token_accuracy": 0.6629000529646873,
"num_tokens": 6487623.0,
"step": 498
},
{
"epoch": 0.31936,
"grad_norm": 4.134445667266846,
"learning_rate": 5e-06,
"loss": 1.1397,
"mean_token_accuracy": 0.6819293051958084,
"num_tokens": 6499986.0,
"step": 499
},
{
"epoch": 0.32,
"grad_norm": 4.104599475860596,
"learning_rate": 5e-06,
"loss": 1.4537,
"mean_token_accuracy": 0.6394720375537872,
"num_tokens": 6512724.0,
"step": 500
},
{
"epoch": 0.32064,
"grad_norm": 3.4379241466522217,
"learning_rate": 5e-06,
"loss": 1.3714,
"mean_token_accuracy": 0.6542030349373817,
"num_tokens": 6526727.0,
"step": 501
},
{
"epoch": 0.32128,
"grad_norm": 3.4537572860717773,
"learning_rate": 5e-06,
"loss": 1.3053,
"mean_token_accuracy": 0.6661063358187675,
"num_tokens": 6542663.0,
"step": 502
},
{
"epoch": 0.32192,
"grad_norm": 3.5106639862060547,
"learning_rate": 5e-06,
"loss": 1.3694,
"mean_token_accuracy": 0.6464217305183411,
"num_tokens": 6557847.0,
"step": 503
},
{
"epoch": 0.32256,
"grad_norm": 3.924419641494751,
"learning_rate": 5e-06,
"loss": 1.2544,
"mean_token_accuracy": 0.6603437811136246,
"num_tokens": 6570515.0,
"step": 504
},
{
"epoch": 0.3232,
"grad_norm": 3.385101318359375,
"learning_rate": 5e-06,
"loss": 1.4872,
"mean_token_accuracy": 0.6421084851026535,
"num_tokens": 6584786.0,
"step": 505
},
{
"epoch": 0.32384,
"grad_norm": 3.378535032272339,
"learning_rate": 5e-06,
"loss": 1.1475,
"mean_token_accuracy": 0.6894903257489204,
"num_tokens": 6598436.0,
"step": 506
},
{
"epoch": 0.32448,
"grad_norm": 4.74169397354126,
"learning_rate": 5e-06,
"loss": 1.4346,
"mean_token_accuracy": 0.6547307670116425,
"num_tokens": 6610436.0,
"step": 507
},
{
"epoch": 0.32512,
"grad_norm": 3.473893165588379,
"learning_rate": 5e-06,
"loss": 1.3827,
"mean_token_accuracy": 0.6525059714913368,
"num_tokens": 6626032.0,
"step": 508
},
{
"epoch": 0.32576,
"grad_norm": 4.2575273513793945,
"learning_rate": 5e-06,
"loss": 1.4238,
"mean_token_accuracy": 0.6410808116197586,
"num_tokens": 6637760.0,
"step": 509
},
{
"epoch": 0.3264,
"grad_norm": 3.5705769062042236,
"learning_rate": 5e-06,
"loss": 1.2437,
"mean_token_accuracy": 0.6628687754273415,
"num_tokens": 6652912.0,
"step": 510
},
{
"epoch": 0.32704,
"grad_norm": 4.5391011238098145,
"learning_rate": 5e-06,
"loss": 1.292,
"mean_token_accuracy": 0.6497529372572899,
"num_tokens": 6664537.0,
"step": 511
},
{
"epoch": 0.32768,
"grad_norm": 4.8541789054870605,
"learning_rate": 5e-06,
"loss": 1.3102,
"mean_token_accuracy": 0.6663089245557785,
"num_tokens": 6674380.0,
"step": 512
},
{
"epoch": 0.32832,
"grad_norm": 4.8184332847595215,
"learning_rate": 5e-06,
"loss": 1.3336,
"mean_token_accuracy": 0.6582650914788246,
"num_tokens": 6686208.0,
"step": 513
},
{
"epoch": 0.32896,
"grad_norm": 3.9492416381835938,
"learning_rate": 5e-06,
"loss": 1.1609,
"mean_token_accuracy": 0.6893665343523026,
"num_tokens": 6698802.0,
"step": 514
},
{
"epoch": 0.3296,
"grad_norm": 3.2947769165039062,
"learning_rate": 5e-06,
"loss": 1.3108,
"mean_token_accuracy": 0.6393668726086617,
"num_tokens": 6714353.0,
"step": 515
},
{
"epoch": 0.33024,
"grad_norm": 3.5826685428619385,
"learning_rate": 5e-06,
"loss": 1.2592,
"mean_token_accuracy": 0.663002572953701,
"num_tokens": 6728955.0,
"step": 516
},
{
"epoch": 0.33088,
"grad_norm": 6.832690238952637,
"learning_rate": 5e-06,
"loss": 1.2339,
"mean_token_accuracy": 0.6615518927574158,
"num_tokens": 6742119.0,
"step": 517
},
{
"epoch": 0.33152,
"grad_norm": 3.935009479522705,
"learning_rate": 5e-06,
"loss": 1.2767,
"mean_token_accuracy": 0.6649063900113106,
"num_tokens": 6754185.0,
"step": 518
},
{
"epoch": 0.33216,
"grad_norm": 4.145579814910889,
"learning_rate": 5e-06,
"loss": 1.5135,
"mean_token_accuracy": 0.6258162558078766,
"num_tokens": 6765367.0,
"step": 519
},
{
"epoch": 0.3328,
"grad_norm": 3.592618227005005,
"learning_rate": 5e-06,
"loss": 1.4578,
"mean_token_accuracy": 0.623950220644474,
"num_tokens": 6778122.0,
"step": 520
},
{
"epoch": 0.33344,
"grad_norm": 5.795764923095703,
"learning_rate": 5e-06,
"loss": 1.4626,
"mean_token_accuracy": 0.6486967876553535,
"num_tokens": 6790660.0,
"step": 521
},
{
"epoch": 0.33408,
"grad_norm": 4.278341293334961,
"learning_rate": 5e-06,
"loss": 1.1897,
"mean_token_accuracy": 0.6738953441381454,
"num_tokens": 6802594.0,
"step": 522
},
{
"epoch": 0.33472,
"grad_norm": 4.899449825286865,
"learning_rate": 5e-06,
"loss": 1.3754,
"mean_token_accuracy": 0.6378564760088921,
"num_tokens": 6818415.0,
"step": 523
},
{
"epoch": 0.33536,
"grad_norm": 4.733186721801758,
"learning_rate": 5e-06,
"loss": 1.36,
"mean_token_accuracy": 0.6564139500260353,
"num_tokens": 6831504.0,
"step": 524
},
{
"epoch": 0.336,
"grad_norm": 3.7966043949127197,
"learning_rate": 5e-06,
"loss": 1.2975,
"mean_token_accuracy": 0.6624530181288719,
"num_tokens": 6843279.0,
"step": 525
},
{
"epoch": 0.33664,
"grad_norm": 5.124260425567627,
"learning_rate": 5e-06,
"loss": 1.5377,
"mean_token_accuracy": 0.6152323558926582,
"num_tokens": 6854556.0,
"step": 526
},
{
"epoch": 0.33728,
"grad_norm": 4.210925579071045,
"learning_rate": 5e-06,
"loss": 1.3342,
"mean_token_accuracy": 0.6440554708242416,
"num_tokens": 6867858.0,
"step": 527
},
{
"epoch": 0.33792,
"grad_norm": 3.751556873321533,
"learning_rate": 5e-06,
"loss": 1.4675,
"mean_token_accuracy": 0.6253782510757446,
"num_tokens": 6881065.0,
"step": 528
},
{
"epoch": 0.33856,
"grad_norm": 6.117438793182373,
"learning_rate": 5e-06,
"loss": 1.356,
"mean_token_accuracy": 0.6668061912059784,
"num_tokens": 6894733.0,
"step": 529
},
{
"epoch": 0.3392,
"grad_norm": 3.5207901000976562,
"learning_rate": 5e-06,
"loss": 1.2676,
"mean_token_accuracy": 0.6576649472117424,
"num_tokens": 6907851.0,
"step": 530
},
{
"epoch": 0.33984,
"grad_norm": 3.6760780811309814,
"learning_rate": 5e-06,
"loss": 1.1176,
"mean_token_accuracy": 0.6879062727093697,
"num_tokens": 6921679.0,
"step": 531
},
{
"epoch": 0.34048,
"grad_norm": 4.656152725219727,
"learning_rate": 5e-06,
"loss": 1.42,
"mean_token_accuracy": 0.642581582069397,
"num_tokens": 6935677.0,
"step": 532
},
{
"epoch": 0.34112,
"grad_norm": 5.187691688537598,
"learning_rate": 5e-06,
"loss": 1.4074,
"mean_token_accuracy": 0.668558657169342,
"num_tokens": 6945440.0,
"step": 533
},
{
"epoch": 0.34176,
"grad_norm": 7.5727949142456055,
"learning_rate": 5e-06,
"loss": 1.4775,
"mean_token_accuracy": 0.6168788969516754,
"num_tokens": 6959496.0,
"step": 534
},
{
"epoch": 0.3424,
"grad_norm": 3.821122646331787,
"learning_rate": 5e-06,
"loss": 1.1118,
"mean_token_accuracy": 0.6967110335826874,
"num_tokens": 6970897.0,
"step": 535
},
{
"epoch": 0.34304,
"grad_norm": 3.28977108001709,
"learning_rate": 5e-06,
"loss": 1.2668,
"mean_token_accuracy": 0.6639266163110733,
"num_tokens": 6986271.0,
"step": 536
},
{
"epoch": 0.34368,
"grad_norm": 4.031164646148682,
"learning_rate": 5e-06,
"loss": 1.2953,
"mean_token_accuracy": 0.6541831567883492,
"num_tokens": 6998841.0,
"step": 537
},
{
"epoch": 0.34432,
"grad_norm": 5.315206527709961,
"learning_rate": 5e-06,
"loss": 1.2881,
"mean_token_accuracy": 0.6494873613119125,
"num_tokens": 7008652.0,
"step": 538
},
{
"epoch": 0.34496,
"grad_norm": 5.740390777587891,
"learning_rate": 5e-06,
"loss": 1.3778,
"mean_token_accuracy": 0.6532981097698212,
"num_tokens": 7020915.0,
"step": 539
},
{
"epoch": 0.3456,
"grad_norm": 5.474863529205322,
"learning_rate": 5e-06,
"loss": 1.1128,
"mean_token_accuracy": 0.7052044421434402,
"num_tokens": 7032119.0,
"step": 540
},
{
"epoch": 0.34624,
"grad_norm": 4.56429386138916,
"learning_rate": 5e-06,
"loss": 1.2831,
"mean_token_accuracy": 0.6683964505791664,
"num_tokens": 7045586.0,
"step": 541
},
{
"epoch": 0.34688,
"grad_norm": 3.815187454223633,
"learning_rate": 5e-06,
"loss": 1.1035,
"mean_token_accuracy": 0.7000684291124344,
"num_tokens": 7057244.0,
"step": 542
},
{
"epoch": 0.34752,
"grad_norm": 6.026943683624268,
"learning_rate": 5e-06,
"loss": 1.2064,
"mean_token_accuracy": 0.6807686313986778,
"num_tokens": 7068752.0,
"step": 543
},
{
"epoch": 0.34816,
"grad_norm": 4.224482536315918,
"learning_rate": 5e-06,
"loss": 1.2508,
"mean_token_accuracy": 0.6724436059594154,
"num_tokens": 7082898.0,
"step": 544
},
{
"epoch": 0.3488,
"grad_norm": 7.96382474899292,
"learning_rate": 5e-06,
"loss": 1.1555,
"mean_token_accuracy": 0.6773002594709396,
"num_tokens": 7095886.0,
"step": 545
},
{
"epoch": 0.34944,
"grad_norm": 4.775862693786621,
"learning_rate": 5e-06,
"loss": 1.2992,
"mean_token_accuracy": 0.6667480766773224,
"num_tokens": 7107762.0,
"step": 546
},
{
"epoch": 0.35008,
"grad_norm": 3.49412202835083,
"learning_rate": 5e-06,
"loss": 1.1785,
"mean_token_accuracy": 0.7002041935920715,
"num_tokens": 7121108.0,
"step": 547
},
{
"epoch": 0.35072,
"grad_norm": 4.250086784362793,
"learning_rate": 5e-06,
"loss": 1.1921,
"mean_token_accuracy": 0.7101981267333031,
"num_tokens": 7132883.0,
"step": 548
},
{
"epoch": 0.35136,
"grad_norm": 3.9039688110351562,
"learning_rate": 5e-06,
"loss": 1.2883,
"mean_token_accuracy": 0.6617401614785194,
"num_tokens": 7146456.0,
"step": 549
},
{
"epoch": 0.352,
"grad_norm": 3.8325276374816895,
"learning_rate": 5e-06,
"loss": 1.315,
"mean_token_accuracy": 0.6657034084200859,
"num_tokens": 7160223.0,
"step": 550
},
{
"epoch": 0.35264,
"grad_norm": 5.472667217254639,
"learning_rate": 5e-06,
"loss": 1.4434,
"mean_token_accuracy": 0.635408416390419,
"num_tokens": 7175945.0,
"step": 551
},
{
"epoch": 0.35328,
"grad_norm": 4.009690761566162,
"learning_rate": 5e-06,
"loss": 1.3101,
"mean_token_accuracy": 0.6696026399731636,
"num_tokens": 7189467.0,
"step": 552
},
{
"epoch": 0.35392,
"grad_norm": 4.114287853240967,
"learning_rate": 5e-06,
"loss": 1.2527,
"mean_token_accuracy": 0.669425942003727,
"num_tokens": 7201319.0,
"step": 553
},
{
"epoch": 0.35456,
"grad_norm": 4.302579402923584,
"learning_rate": 5e-06,
"loss": 1.3494,
"mean_token_accuracy": 0.63911372423172,
"num_tokens": 7213327.0,
"step": 554
},
{
"epoch": 0.3552,
"grad_norm": 3.737901210784912,
"learning_rate": 5e-06,
"loss": 1.3994,
"mean_token_accuracy": 0.6371675282716751,
"num_tokens": 7226217.0,
"step": 555
},
{
"epoch": 0.35584,
"grad_norm": 3.517141103744507,
"learning_rate": 5e-06,
"loss": 1.5393,
"mean_token_accuracy": 0.615730918943882,
"num_tokens": 7240381.0,
"step": 556
},
{
"epoch": 0.35648,
"grad_norm": 4.263305187225342,
"learning_rate": 5e-06,
"loss": 1.1575,
"mean_token_accuracy": 0.6844438910484314,
"num_tokens": 7251375.0,
"step": 557
},
{
"epoch": 0.35712,
"grad_norm": 4.197317600250244,
"learning_rate": 5e-06,
"loss": 1.3062,
"mean_token_accuracy": 0.642548106610775,
"num_tokens": 7265647.0,
"step": 558
},
{
"epoch": 0.35776,
"grad_norm": 4.2730560302734375,
"learning_rate": 5e-06,
"loss": 1.2598,
"mean_token_accuracy": 0.6705774366855621,
"num_tokens": 7277240.0,
"step": 559
},
{
"epoch": 0.3584,
"grad_norm": 5.627854347229004,
"learning_rate": 5e-06,
"loss": 1.133,
"mean_token_accuracy": 0.6944706663489342,
"num_tokens": 7293050.0,
"step": 560
},
{
"epoch": 0.35904,
"grad_norm": 5.039371013641357,
"learning_rate": 5e-06,
"loss": 1.3061,
"mean_token_accuracy": 0.652328722178936,
"num_tokens": 7305212.0,
"step": 561
},
{
"epoch": 0.35968,
"grad_norm": 4.255235195159912,
"learning_rate": 5e-06,
"loss": 1.2425,
"mean_token_accuracy": 0.6588255614042282,
"num_tokens": 7318058.0,
"step": 562
},
{
"epoch": 0.36032,
"grad_norm": 3.5205321311950684,
"learning_rate": 5e-06,
"loss": 1.3016,
"mean_token_accuracy": 0.660710796713829,
"num_tokens": 7330484.0,
"step": 563
},
{
"epoch": 0.36096,
"grad_norm": 4.263877868652344,
"learning_rate": 5e-06,
"loss": 1.2689,
"mean_token_accuracy": 0.6572084054350853,
"num_tokens": 7342403.0,
"step": 564
},
{
"epoch": 0.3616,
"grad_norm": 3.9740233421325684,
"learning_rate": 5e-06,
"loss": 1.1917,
"mean_token_accuracy": 0.6762436851859093,
"num_tokens": 7353974.0,
"step": 565
},
{
"epoch": 0.36224,
"grad_norm": 3.4019787311553955,
"learning_rate": 5e-06,
"loss": 1.4559,
"mean_token_accuracy": 0.6335306763648987,
"num_tokens": 7371145.0,
"step": 566
},
{
"epoch": 0.36288,
"grad_norm": 3.6773386001586914,
"learning_rate": 5e-06,
"loss": 1.3727,
"mean_token_accuracy": 0.6477261707186699,
"num_tokens": 7385503.0,
"step": 567
},
{
"epoch": 0.36352,
"grad_norm": 3.533553123474121,
"learning_rate": 5e-06,
"loss": 1.3915,
"mean_token_accuracy": 0.6359102874994278,
"num_tokens": 7397974.0,
"step": 568
},
{
"epoch": 0.36416,
"grad_norm": 4.083873271942139,
"learning_rate": 5e-06,
"loss": 1.2959,
"mean_token_accuracy": 0.6282073631882668,
"num_tokens": 7409071.0,
"step": 569
},
{
"epoch": 0.3648,
"grad_norm": 3.371812582015991,
"learning_rate": 5e-06,
"loss": 1.3724,
"mean_token_accuracy": 0.6561341881752014,
"num_tokens": 7425801.0,
"step": 570
},
{
"epoch": 0.36544,
"grad_norm": 5.2290425300598145,
"learning_rate": 5e-06,
"loss": 1.3808,
"mean_token_accuracy": 0.66990677267313,
"num_tokens": 7435083.0,
"step": 571
},
{
"epoch": 0.36608,
"grad_norm": 3.8227179050445557,
"learning_rate": 5e-06,
"loss": 1.2523,
"mean_token_accuracy": 0.6640745401382446,
"num_tokens": 7445874.0,
"step": 572
},
{
"epoch": 0.36672,
"grad_norm": 3.826213836669922,
"learning_rate": 5e-06,
"loss": 1.2955,
"mean_token_accuracy": 0.6566179618239403,
"num_tokens": 7458792.0,
"step": 573
},
{
"epoch": 0.36736,
"grad_norm": 3.166212558746338,
"learning_rate": 5e-06,
"loss": 1.3854,
"mean_token_accuracy": 0.6362887248396873,
"num_tokens": 7474364.0,
"step": 574
},
{
"epoch": 0.368,
"grad_norm": 3.225037097930908,
"learning_rate": 5e-06,
"loss": 1.4097,
"mean_token_accuracy": 0.6328474953770638,
"num_tokens": 7489627.0,
"step": 575
},
{
"epoch": 0.36864,
"grad_norm": 4.110698699951172,
"learning_rate": 5e-06,
"loss": 1.2922,
"mean_token_accuracy": 0.6546645760536194,
"num_tokens": 7501958.0,
"step": 576
},
{
"epoch": 0.36928,
"grad_norm": 3.426607608795166,
"learning_rate": 5e-06,
"loss": 1.4665,
"mean_token_accuracy": 0.6124880164861679,
"num_tokens": 7515385.0,
"step": 577
},
{
"epoch": 0.36992,
"grad_norm": 3.6768105030059814,
"learning_rate": 5e-06,
"loss": 1.2686,
"mean_token_accuracy": 0.6734522432088852,
"num_tokens": 7528618.0,
"step": 578
},
{
"epoch": 0.37056,
"grad_norm": 3.3351573944091797,
"learning_rate": 5e-06,
"loss": 1.351,
"mean_token_accuracy": 0.6506511121988297,
"num_tokens": 7543651.0,
"step": 579
},
{
"epoch": 0.3712,
"grad_norm": 4.15482759475708,
"learning_rate": 5e-06,
"loss": 1.3044,
"mean_token_accuracy": 0.6768280491232872,
"num_tokens": 7554851.0,
"step": 580
},
{
"epoch": 0.37184,
"grad_norm": 4.212845802307129,
"learning_rate": 5e-06,
"loss": 1.3038,
"mean_token_accuracy": 0.6363187730312347,
"num_tokens": 7567001.0,
"step": 581
},
{
"epoch": 0.37248,
"grad_norm": 4.185598850250244,
"learning_rate": 5e-06,
"loss": 1.3491,
"mean_token_accuracy": 0.6468858942389488,
"num_tokens": 7579432.0,
"step": 582
},
{
"epoch": 0.37312,
"grad_norm": 3.4942967891693115,
"learning_rate": 5e-06,
"loss": 1.364,
"mean_token_accuracy": 0.650618351995945,
"num_tokens": 7593471.0,
"step": 583
},
{
"epoch": 0.37376,
"grad_norm": 3.4861021041870117,
"learning_rate": 5e-06,
"loss": 1.1446,
"mean_token_accuracy": 0.6844891607761383,
"num_tokens": 7608553.0,
"step": 584
},
{
"epoch": 0.3744,
"grad_norm": 3.893850803375244,
"learning_rate": 5e-06,
"loss": 1.458,
"mean_token_accuracy": 0.621130146086216,
"num_tokens": 7621367.0,
"step": 585
},
{
"epoch": 0.37504,
"grad_norm": 3.653973340988159,
"learning_rate": 5e-06,
"loss": 1.3408,
"mean_token_accuracy": 0.6439371258020401,
"num_tokens": 7634694.0,
"step": 586
},
{
"epoch": 0.37568,
"grad_norm": 3.94148850440979,
"learning_rate": 5e-06,
"loss": 1.4367,
"mean_token_accuracy": 0.6162022799253464,
"num_tokens": 7646183.0,
"step": 587
},
{
"epoch": 0.37632,
"grad_norm": 3.2505555152893066,
"learning_rate": 5e-06,
"loss": 1.4763,
"mean_token_accuracy": 0.6298285201191902,
"num_tokens": 7661964.0,
"step": 588
},
{
"epoch": 0.37696,
"grad_norm": 3.1683783531188965,
"learning_rate": 5e-06,
"loss": 1.18,
"mean_token_accuracy": 0.6652778312563896,
"num_tokens": 7676405.0,
"step": 589
},
{
"epoch": 0.3776,
"grad_norm": 4.221475601196289,
"learning_rate": 5e-06,
"loss": 1.2668,
"mean_token_accuracy": 0.6732780113816261,
"num_tokens": 7690168.0,
"step": 590
},
{
"epoch": 0.37824,
"grad_norm": 4.262617111206055,
"learning_rate": 5e-06,
"loss": 1.2225,
"mean_token_accuracy": 0.6608899086713791,
"num_tokens": 7700128.0,
"step": 591
},
{
"epoch": 0.37888,
"grad_norm": 3.564286231994629,
"learning_rate": 5e-06,
"loss": 1.4015,
"mean_token_accuracy": 0.6410685330629349,
"num_tokens": 7711700.0,
"step": 592
},
{
"epoch": 0.37952,
"grad_norm": 3.5013587474823,
"learning_rate": 5e-06,
"loss": 1.0532,
"mean_token_accuracy": 0.7084442153573036,
"num_tokens": 7722670.0,
"step": 593
},
{
"epoch": 0.38016,
"grad_norm": 4.010073661804199,
"learning_rate": 5e-06,
"loss": 1.2001,
"mean_token_accuracy": 0.6808509230613708,
"num_tokens": 7734476.0,
"step": 594
},
{
"epoch": 0.3808,
"grad_norm": 3.8407106399536133,
"learning_rate": 5e-06,
"loss": 1.2772,
"mean_token_accuracy": 0.661079652607441,
"num_tokens": 7747191.0,
"step": 595
},
{
"epoch": 0.38144,
"grad_norm": 3.9195191860198975,
"learning_rate": 5e-06,
"loss": 1.2063,
"mean_token_accuracy": 0.6758553683757782,
"num_tokens": 7757542.0,
"step": 596
},
{
"epoch": 0.38208,
"grad_norm": 5.459002494812012,
"learning_rate": 5e-06,
"loss": 1.4173,
"mean_token_accuracy": 0.6381874680519104,
"num_tokens": 7770427.0,
"step": 597
},
{
"epoch": 0.38272,
"grad_norm": 4.335068702697754,
"learning_rate": 5e-06,
"loss": 1.2628,
"mean_token_accuracy": 0.6815316006541252,
"num_tokens": 7781771.0,
"step": 598
},
{
"epoch": 0.38336,
"grad_norm": 3.5294859409332275,
"learning_rate": 5e-06,
"loss": 1.1975,
"mean_token_accuracy": 0.6658317893743515,
"num_tokens": 7796104.0,
"step": 599
},
{
"epoch": 0.384,
"grad_norm": 5.696824550628662,
"learning_rate": 5e-06,
"loss": 1.3531,
"mean_token_accuracy": 0.6557957530021667,
"num_tokens": 7808117.0,
"step": 600
},
{
"epoch": 0.38464,
"grad_norm": 3.5926239490509033,
"learning_rate": 5e-06,
"loss": 1.276,
"mean_token_accuracy": 0.6641542464494705,
"num_tokens": 7819726.0,
"step": 601
},
{
"epoch": 0.38528,
"grad_norm": 3.8258309364318848,
"learning_rate": 5e-06,
"loss": 1.3628,
"mean_token_accuracy": 0.6388561427593231,
"num_tokens": 7833205.0,
"step": 602
},
{
"epoch": 0.38592,
"grad_norm": 8.0659761428833,
"learning_rate": 5e-06,
"loss": 1.2999,
"mean_token_accuracy": 0.6774822995066643,
"num_tokens": 7847836.0,
"step": 603
},
{
"epoch": 0.38656,
"grad_norm": 3.9128899574279785,
"learning_rate": 5e-06,
"loss": 1.1609,
"mean_token_accuracy": 0.6698524802923203,
"num_tokens": 7860682.0,
"step": 604
},
{
"epoch": 0.3872,
"grad_norm": 4.181707382202148,
"learning_rate": 5e-06,
"loss": 1.5005,
"mean_token_accuracy": 0.6392693892121315,
"num_tokens": 7872637.0,
"step": 605
},
{
"epoch": 0.38784,
"grad_norm": 4.092964172363281,
"learning_rate": 5e-06,
"loss": 1.4009,
"mean_token_accuracy": 0.6333677843213081,
"num_tokens": 7884650.0,
"step": 606
},
{
"epoch": 0.38848,
"grad_norm": 4.87518835067749,
"learning_rate": 5e-06,
"loss": 1.3242,
"mean_token_accuracy": 0.6464787498116493,
"num_tokens": 7898052.0,
"step": 607
},
{
"epoch": 0.38912,
"grad_norm": 4.693853855133057,
"learning_rate": 5e-06,
"loss": 1.3407,
"mean_token_accuracy": 0.6689095348119736,
"num_tokens": 7912216.0,
"step": 608
},
{
"epoch": 0.38976,
"grad_norm": 4.081251621246338,
"learning_rate": 5e-06,
"loss": 1.3657,
"mean_token_accuracy": 0.6710544601082802,
"num_tokens": 7924053.0,
"step": 609
},
{
"epoch": 0.3904,
"grad_norm": 25.89602279663086,
"learning_rate": 5e-06,
"loss": 1.3437,
"mean_token_accuracy": 0.6381836906075478,
"num_tokens": 7936238.0,
"step": 610
},
{
"epoch": 0.39104,
"grad_norm": 5.2313055992126465,
"learning_rate": 5e-06,
"loss": 1.341,
"mean_token_accuracy": 0.6564094573259354,
"num_tokens": 7948038.0,
"step": 611
},
{
"epoch": 0.39168,
"grad_norm": 3.936708927154541,
"learning_rate": 5e-06,
"loss": 1.3564,
"mean_token_accuracy": 0.6562279239296913,
"num_tokens": 7960193.0,
"step": 612
},
{
"epoch": 0.39232,
"grad_norm": 5.367516994476318,
"learning_rate": 5e-06,
"loss": 1.2065,
"mean_token_accuracy": 0.6724315732717514,
"num_tokens": 7974200.0,
"step": 613
},
{
"epoch": 0.39296,
"grad_norm": 4.628478527069092,
"learning_rate": 5e-06,
"loss": 1.4374,
"mean_token_accuracy": 0.6563806012272835,
"num_tokens": 7988501.0,
"step": 614
},
{
"epoch": 0.3936,
"grad_norm": 5.892339706420898,
"learning_rate": 5e-06,
"loss": 1.2956,
"mean_token_accuracy": 0.6271971762180328,
"num_tokens": 8000663.0,
"step": 615
},
{
"epoch": 0.39424,
"grad_norm": 7.812566757202148,
"learning_rate": 5e-06,
"loss": 1.294,
"mean_token_accuracy": 0.6547481939196587,
"num_tokens": 8014228.0,
"step": 616
},
{
"epoch": 0.39488,
"grad_norm": 3.2736427783966064,
"learning_rate": 5e-06,
"loss": 1.0853,
"mean_token_accuracy": 0.7260592132806778,
"num_tokens": 8028431.0,
"step": 617
},
{
"epoch": 0.39552,
"grad_norm": 4.344757080078125,
"learning_rate": 5e-06,
"loss": 1.1555,
"mean_token_accuracy": 0.6958058997988701,
"num_tokens": 8038986.0,
"step": 618
},
{
"epoch": 0.39616,
"grad_norm": 4.368517875671387,
"learning_rate": 5e-06,
"loss": 1.2744,
"mean_token_accuracy": 0.6637471318244934,
"num_tokens": 8050087.0,
"step": 619
},
{
"epoch": 0.3968,
"grad_norm": 3.57128643989563,
"learning_rate": 5e-06,
"loss": 1.4689,
"mean_token_accuracy": 0.6049885600805283,
"num_tokens": 8063113.0,
"step": 620
},
{
"epoch": 0.39744,
"grad_norm": 3.331205368041992,
"learning_rate": 5e-06,
"loss": 1.4329,
"mean_token_accuracy": 0.6517436727881432,
"num_tokens": 8079228.0,
"step": 621
},
{
"epoch": 0.39808,
"grad_norm": 4.018087863922119,
"learning_rate": 5e-06,
"loss": 1.0947,
"mean_token_accuracy": 0.696721188724041,
"num_tokens": 8092023.0,
"step": 622
},
{
"epoch": 0.39872,
"grad_norm": 3.527395486831665,
"learning_rate": 5e-06,
"loss": 1.3711,
"mean_token_accuracy": 0.6707320511341095,
"num_tokens": 8107080.0,
"step": 623
},
{
"epoch": 0.39936,
"grad_norm": 3.9434077739715576,
"learning_rate": 5e-06,
"loss": 1.4069,
"mean_token_accuracy": 0.6350800693035126,
"num_tokens": 8120374.0,
"step": 624
},
{
"epoch": 0.4,
"grad_norm": 9.819520950317383,
"learning_rate": 5e-06,
"loss": 1.3298,
"mean_token_accuracy": 0.6648931205272675,
"num_tokens": 8131526.0,
"step": 625
},
{
"epoch": 0.40064,
"grad_norm": 3.7477002143859863,
"learning_rate": 5e-06,
"loss": 1.3742,
"mean_token_accuracy": 0.6332258731126785,
"num_tokens": 8144842.0,
"step": 626
},
{
"epoch": 0.40128,
"grad_norm": 3.494314432144165,
"learning_rate": 5e-06,
"loss": 1.4336,
"mean_token_accuracy": 0.6268866658210754,
"num_tokens": 8157481.0,
"step": 627
},
{
"epoch": 0.40192,
"grad_norm": 4.175013542175293,
"learning_rate": 5e-06,
"loss": 1.1405,
"mean_token_accuracy": 0.6741645857691765,
"num_tokens": 8168266.0,
"step": 628
},
{
"epoch": 0.40256,
"grad_norm": 3.4416167736053467,
"learning_rate": 5e-06,
"loss": 1.3,
"mean_token_accuracy": 0.6660801768302917,
"num_tokens": 8182677.0,
"step": 629
},
{
"epoch": 0.4032,
"grad_norm": 4.334346771240234,
"learning_rate": 5e-06,
"loss": 1.2434,
"mean_token_accuracy": 0.6613388434052467,
"num_tokens": 8195270.0,
"step": 630
},
{
"epoch": 0.40384,
"grad_norm": 3.4856371879577637,
"learning_rate": 5e-06,
"loss": 1.3815,
"mean_token_accuracy": 0.6581474095582962,
"num_tokens": 8209764.0,
"step": 631
},
{
"epoch": 0.40448,
"grad_norm": 4.153162002563477,
"learning_rate": 5e-06,
"loss": 1.3904,
"mean_token_accuracy": 0.6714613437652588,
"num_tokens": 8220815.0,
"step": 632
},
{
"epoch": 0.40512,
"grad_norm": 4.055039405822754,
"learning_rate": 5e-06,
"loss": 1.3551,
"mean_token_accuracy": 0.6623844504356384,
"num_tokens": 8234827.0,
"step": 633
},
{
"epoch": 0.40576,
"grad_norm": 3.558887481689453,
"learning_rate": 5e-06,
"loss": 1.3641,
"mean_token_accuracy": 0.6388072147965431,
"num_tokens": 8246889.0,
"step": 634
},
{
"epoch": 0.4064,
"grad_norm": 4.767853736877441,
"learning_rate": 5e-06,
"loss": 1.6547,
"mean_token_accuracy": 0.5957511439919472,
"num_tokens": 8259650.0,
"step": 635
},
{
"epoch": 0.40704,
"grad_norm": 3.799283504486084,
"learning_rate": 5e-06,
"loss": 1.2804,
"mean_token_accuracy": 0.676610916852951,
"num_tokens": 8271638.0,
"step": 636
},
{
"epoch": 0.40768,
"grad_norm": 3.697746753692627,
"learning_rate": 5e-06,
"loss": 1.3211,
"mean_token_accuracy": 0.6482478119432926,
"num_tokens": 8285204.0,
"step": 637
},
{
"epoch": 0.40832,
"grad_norm": 3.6978259086608887,
"learning_rate": 5e-06,
"loss": 1.2889,
"mean_token_accuracy": 0.6479171589016914,
"num_tokens": 8297791.0,
"step": 638
},
{
"epoch": 0.40896,
"grad_norm": 4.445859432220459,
"learning_rate": 5e-06,
"loss": 1.4383,
"mean_token_accuracy": 0.6379449293017387,
"num_tokens": 8308088.0,
"step": 639
},
{
"epoch": 0.4096,
"grad_norm": 3.462293863296509,
"learning_rate": 5e-06,
"loss": 1.1934,
"mean_token_accuracy": 0.6681175008416176,
"num_tokens": 8322994.0,
"step": 640
},
{
"epoch": 0.41024,
"grad_norm": 3.471963405609131,
"learning_rate": 5e-06,
"loss": 1.4795,
"mean_token_accuracy": 0.6364353597164154,
"num_tokens": 8336239.0,
"step": 641
},
{
"epoch": 0.41088,
"grad_norm": 4.054087162017822,
"learning_rate": 5e-06,
"loss": 1.4736,
"mean_token_accuracy": 0.6363670602440834,
"num_tokens": 8347731.0,
"step": 642
},
{
"epoch": 0.41152,
"grad_norm": 3.717003583908081,
"learning_rate": 5e-06,
"loss": 1.4482,
"mean_token_accuracy": 0.6272126361727715,
"num_tokens": 8359999.0,
"step": 643
},
{
"epoch": 0.41216,
"grad_norm": 3.302205801010132,
"learning_rate": 5e-06,
"loss": 1.0763,
"mean_token_accuracy": 0.6774929463863373,
"num_tokens": 8373684.0,
"step": 644
},
{
"epoch": 0.4128,
"grad_norm": 3.4035229682922363,
"learning_rate": 5e-06,
"loss": 1.2946,
"mean_token_accuracy": 0.6636142283678055,
"num_tokens": 8387313.0,
"step": 645
},
{
"epoch": 0.41344,
"grad_norm": 3.3320178985595703,
"learning_rate": 5e-06,
"loss": 1.2419,
"mean_token_accuracy": 0.6601713374257088,
"num_tokens": 8401909.0,
"step": 646
},
{
"epoch": 0.41408,
"grad_norm": 4.073376178741455,
"learning_rate": 5e-06,
"loss": 1.392,
"mean_token_accuracy": 0.6541470885276794,
"num_tokens": 8412682.0,
"step": 647
},
{
"epoch": 0.41472,
"grad_norm": 3.4275381565093994,
"learning_rate": 5e-06,
"loss": 1.2128,
"mean_token_accuracy": 0.7010362893342972,
"num_tokens": 8425791.0,
"step": 648
},
{
"epoch": 0.41536,
"grad_norm": 3.013326644897461,
"learning_rate": 5e-06,
"loss": 1.2823,
"mean_token_accuracy": 0.6790317669510841,
"num_tokens": 8441869.0,
"step": 649
},
{
"epoch": 0.416,
"grad_norm": 3.8601441383361816,
"learning_rate": 5e-06,
"loss": 1.1898,
"mean_token_accuracy": 0.6818736344575882,
"num_tokens": 8453324.0,
"step": 650
},
{
"epoch": 0.41664,
"grad_norm": 4.1017537117004395,
"learning_rate": 5e-06,
"loss": 1.2861,
"mean_token_accuracy": 0.6557259410619736,
"num_tokens": 8465258.0,
"step": 651
},
{
"epoch": 0.41728,
"grad_norm": 4.002110481262207,
"learning_rate": 5e-06,
"loss": 1.3043,
"mean_token_accuracy": 0.6684800609946251,
"num_tokens": 8477574.0,
"step": 652
},
{
"epoch": 0.41792,
"grad_norm": 3.2750160694122314,
"learning_rate": 5e-06,
"loss": 1.137,
"mean_token_accuracy": 0.6889987885951996,
"num_tokens": 8493134.0,
"step": 653
},
{
"epoch": 0.41856,
"grad_norm": 4.388451099395752,
"learning_rate": 5e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.6810869425535202,
"num_tokens": 8504478.0,
"step": 654
},
{
"epoch": 0.4192,
"grad_norm": 4.337303161621094,
"learning_rate": 5e-06,
"loss": 1.2395,
"mean_token_accuracy": 0.6621519103646278,
"num_tokens": 8515776.0,
"step": 655
},
{
"epoch": 0.41984,
"grad_norm": 3.3329954147338867,
"learning_rate": 5e-06,
"loss": 1.2419,
"mean_token_accuracy": 0.6687774360179901,
"num_tokens": 8530155.0,
"step": 656
},
{
"epoch": 0.42048,
"grad_norm": 4.358274459838867,
"learning_rate": 5e-06,
"loss": 1.4287,
"mean_token_accuracy": 0.6304730176925659,
"num_tokens": 8541253.0,
"step": 657
},
{
"epoch": 0.42112,
"grad_norm": 3.5478384494781494,
"learning_rate": 5e-06,
"loss": 1.0695,
"mean_token_accuracy": 0.7162381857633591,
"num_tokens": 8555920.0,
"step": 658
},
{
"epoch": 0.42176,
"grad_norm": 3.6502294540405273,
"learning_rate": 5e-06,
"loss": 1.5178,
"mean_token_accuracy": 0.6263556554913521,
"num_tokens": 8568098.0,
"step": 659
},
{
"epoch": 0.4224,
"grad_norm": 3.196720838546753,
"learning_rate": 5e-06,
"loss": 1.3436,
"mean_token_accuracy": 0.6646198481321335,
"num_tokens": 8583176.0,
"step": 660
},
{
"epoch": 0.42304,
"grad_norm": 4.109900951385498,
"learning_rate": 5e-06,
"loss": 1.2928,
"mean_token_accuracy": 0.6598446816205978,
"num_tokens": 8593933.0,
"step": 661
},
{
"epoch": 0.42368,
"grad_norm": 3.258894205093384,
"learning_rate": 5e-06,
"loss": 1.1133,
"mean_token_accuracy": 0.6876930743455887,
"num_tokens": 8607476.0,
"step": 662
},
{
"epoch": 0.42432,
"grad_norm": 3.369394063949585,
"learning_rate": 5e-06,
"loss": 1.4229,
"mean_token_accuracy": 0.6260672360658646,
"num_tokens": 8620304.0,
"step": 663
},
{
"epoch": 0.42496,
"grad_norm": 2.924621820449829,
"learning_rate": 5e-06,
"loss": 1.3395,
"mean_token_accuracy": 0.6627652049064636,
"num_tokens": 8637128.0,
"step": 664
},
{
"epoch": 0.4256,
"grad_norm": 3.057220458984375,
"learning_rate": 5e-06,
"loss": 1.3794,
"mean_token_accuracy": 0.6343020871281624,
"num_tokens": 8653441.0,
"step": 665
},
{
"epoch": 0.42624,
"grad_norm": 3.4192020893096924,
"learning_rate": 5e-06,
"loss": 1.2575,
"mean_token_accuracy": 0.6774614155292511,
"num_tokens": 8667588.0,
"step": 666
},
{
"epoch": 0.42688,
"grad_norm": 3.5300302505493164,
"learning_rate": 5e-06,
"loss": 1.4013,
"mean_token_accuracy": 0.6498018577694893,
"num_tokens": 8680263.0,
"step": 667
},
{
"epoch": 0.42752,
"grad_norm": 4.497870922088623,
"learning_rate": 5e-06,
"loss": 1.3489,
"mean_token_accuracy": 0.6535830795764923,
"num_tokens": 8691240.0,
"step": 668
},
{
"epoch": 0.42816,
"grad_norm": 4.537415504455566,
"learning_rate": 5e-06,
"loss": 1.1375,
"mean_token_accuracy": 0.6975407898426056,
"num_tokens": 8703412.0,
"step": 669
},
{
"epoch": 0.4288,
"grad_norm": 4.318458080291748,
"learning_rate": 5e-06,
"loss": 1.3989,
"mean_token_accuracy": 0.6657192297279835,
"num_tokens": 8715529.0,
"step": 670
},
{
"epoch": 0.42944,
"grad_norm": 3.754676580429077,
"learning_rate": 5e-06,
"loss": 1.3016,
"mean_token_accuracy": 0.6766445562243462,
"num_tokens": 8727543.0,
"step": 671
},
{
"epoch": 0.43008,
"grad_norm": 3.548112630844116,
"learning_rate": 5e-06,
"loss": 1.2104,
"mean_token_accuracy": 0.672496572136879,
"num_tokens": 8741690.0,
"step": 672
},
{
"epoch": 0.43072,
"grad_norm": 3.170255422592163,
"learning_rate": 5e-06,
"loss": 1.2556,
"mean_token_accuracy": 0.6616998463869095,
"num_tokens": 8756679.0,
"step": 673
},
{
"epoch": 0.43136,
"grad_norm": 4.215174674987793,
"learning_rate": 5e-06,
"loss": 1.3897,
"mean_token_accuracy": 0.6362641379237175,
"num_tokens": 8766546.0,
"step": 674
},
{
"epoch": 0.432,
"grad_norm": 3.93945050239563,
"learning_rate": 5e-06,
"loss": 1.5469,
"mean_token_accuracy": 0.6284241452813148,
"num_tokens": 8777535.0,
"step": 675
},
{
"epoch": 0.43264,
"grad_norm": 3.380723237991333,
"learning_rate": 5e-06,
"loss": 1.2613,
"mean_token_accuracy": 0.6732500046491623,
"num_tokens": 8790959.0,
"step": 676
},
{
"epoch": 0.43328,
"grad_norm": 3.63143253326416,
"learning_rate": 5e-06,
"loss": 1.3131,
"mean_token_accuracy": 0.6863637119531631,
"num_tokens": 8804351.0,
"step": 677
},
{
"epoch": 0.43392,
"grad_norm": 3.2990407943725586,
"learning_rate": 5e-06,
"loss": 1.3531,
"mean_token_accuracy": 0.6465996205806732,
"num_tokens": 8819041.0,
"step": 678
},
{
"epoch": 0.43456,
"grad_norm": 3.4251043796539307,
"learning_rate": 5e-06,
"loss": 1.5304,
"mean_token_accuracy": 0.6462676748633385,
"num_tokens": 8833926.0,
"step": 679
},
{
"epoch": 0.4352,
"grad_norm": 3.7104909420013428,
"learning_rate": 5e-06,
"loss": 1.0284,
"mean_token_accuracy": 0.695975661277771,
"num_tokens": 8845597.0,
"step": 680
},
{
"epoch": 0.43584,
"grad_norm": 3.3179309368133545,
"learning_rate": 5e-06,
"loss": 1.3163,
"mean_token_accuracy": 0.6567405387759209,
"num_tokens": 8860280.0,
"step": 681
},
{
"epoch": 0.43648,
"grad_norm": 3.607069730758667,
"learning_rate": 5e-06,
"loss": 1.3554,
"mean_token_accuracy": 0.6537708342075348,
"num_tokens": 8873793.0,
"step": 682
},
{
"epoch": 0.43712,
"grad_norm": 3.275057554244995,
"learning_rate": 5e-06,
"loss": 1.3613,
"mean_token_accuracy": 0.6499952375888824,
"num_tokens": 8886476.0,
"step": 683
},
{
"epoch": 0.43776,
"grad_norm": 3.3160624504089355,
"learning_rate": 5e-06,
"loss": 1.2898,
"mean_token_accuracy": 0.6765732616186142,
"num_tokens": 8900749.0,
"step": 684
},
{
"epoch": 0.4384,
"grad_norm": 3.347907543182373,
"learning_rate": 5e-06,
"loss": 1.2472,
"mean_token_accuracy": 0.6879568248987198,
"num_tokens": 8913760.0,
"step": 685
},
{
"epoch": 0.43904,
"grad_norm": 3.862211227416992,
"learning_rate": 5e-06,
"loss": 1.424,
"mean_token_accuracy": 0.6521790996193886,
"num_tokens": 8926061.0,
"step": 686
},
{
"epoch": 0.43968,
"grad_norm": 3.4736506938934326,
"learning_rate": 5e-06,
"loss": 1.0292,
"mean_token_accuracy": 0.7187496647238731,
"num_tokens": 8939507.0,
"step": 687
},
{
"epoch": 0.44032,
"grad_norm": 3.3794503211975098,
"learning_rate": 5e-06,
"loss": 1.22,
"mean_token_accuracy": 0.6592478863894939,
"num_tokens": 8952411.0,
"step": 688
},
{
"epoch": 0.44096,
"grad_norm": 3.6152052879333496,
"learning_rate": 5e-06,
"loss": 1.1974,
"mean_token_accuracy": 0.6630196422338486,
"num_tokens": 8966507.0,
"step": 689
},
{
"epoch": 0.4416,
"grad_norm": 4.036067485809326,
"learning_rate": 5e-06,
"loss": 1.3094,
"mean_token_accuracy": 0.6584384590387344,
"num_tokens": 8978156.0,
"step": 690
},
{
"epoch": 0.44224,
"grad_norm": 3.740229845046997,
"learning_rate": 5e-06,
"loss": 1.3378,
"mean_token_accuracy": 0.6580435633659363,
"num_tokens": 8990929.0,
"step": 691
},
{
"epoch": 0.44288,
"grad_norm": 3.417703866958618,
"learning_rate": 5e-06,
"loss": 1.4495,
"mean_token_accuracy": 0.6308450028300285,
"num_tokens": 9004923.0,
"step": 692
},
{
"epoch": 0.44352,
"grad_norm": 4.16903829574585,
"learning_rate": 5e-06,
"loss": 1.4896,
"mean_token_accuracy": 0.6304129362106323,
"num_tokens": 9018493.0,
"step": 693
},
{
"epoch": 0.44416,
"grad_norm": 3.383941411972046,
"learning_rate": 5e-06,
"loss": 1.4175,
"mean_token_accuracy": 0.6552764996886253,
"num_tokens": 9032465.0,
"step": 694
},
{
"epoch": 0.4448,
"grad_norm": 3.398747205734253,
"learning_rate": 5e-06,
"loss": 1.2548,
"mean_token_accuracy": 0.6547529026865959,
"num_tokens": 9045706.0,
"step": 695
},
{
"epoch": 0.44544,
"grad_norm": 3.575016975402832,
"learning_rate": 5e-06,
"loss": 1.3807,
"mean_token_accuracy": 0.6451460421085358,
"num_tokens": 9059033.0,
"step": 696
},
{
"epoch": 0.44608,
"grad_norm": 3.3936767578125,
"learning_rate": 5e-06,
"loss": 1.2398,
"mean_token_accuracy": 0.6596207022666931,
"num_tokens": 9072068.0,
"step": 697
},
{
"epoch": 0.44672,
"grad_norm": 3.675055980682373,
"learning_rate": 5e-06,
"loss": 1.299,
"mean_token_accuracy": 0.6535976231098175,
"num_tokens": 9084713.0,
"step": 698
},
{
"epoch": 0.44736,
"grad_norm": 3.564359426498413,
"learning_rate": 5e-06,
"loss": 1.3035,
"mean_token_accuracy": 0.70219536870718,
"num_tokens": 9097477.0,
"step": 699
},
{
"epoch": 0.448,
"grad_norm": 3.400031566619873,
"learning_rate": 5e-06,
"loss": 1.4423,
"mean_token_accuracy": 0.6343613564968109,
"num_tokens": 9112699.0,
"step": 700
},
{
"epoch": 0.44864,
"grad_norm": 3.9619691371917725,
"learning_rate": 5e-06,
"loss": 1.3641,
"mean_token_accuracy": 0.6419604942202568,
"num_tokens": 9124943.0,
"step": 701
},
{
"epoch": 0.44928,
"grad_norm": 3.6950299739837646,
"learning_rate": 5e-06,
"loss": 1.1939,
"mean_token_accuracy": 0.6645899340510368,
"num_tokens": 9136459.0,
"step": 702
},
{
"epoch": 0.44992,
"grad_norm": 2.9667203426361084,
"learning_rate": 5e-06,
"loss": 1.0622,
"mean_token_accuracy": 0.7160019502043724,
"num_tokens": 9153604.0,
"step": 703
},
{
"epoch": 0.45056,
"grad_norm": 3.338284969329834,
"learning_rate": 5e-06,
"loss": 1.3143,
"mean_token_accuracy": 0.6503826230764389,
"num_tokens": 9168475.0,
"step": 704
},
{
"epoch": 0.4512,
"grad_norm": 3.5206825733184814,
"learning_rate": 5e-06,
"loss": 1.3616,
"mean_token_accuracy": 0.6541919782757759,
"num_tokens": 9180057.0,
"step": 705
},
{
"epoch": 0.45184,
"grad_norm": 3.8366057872772217,
"learning_rate": 5e-06,
"loss": 1.0917,
"mean_token_accuracy": 0.701392412185669,
"num_tokens": 9191539.0,
"step": 706
},
{
"epoch": 0.45248,
"grad_norm": 3.7983529567718506,
"learning_rate": 5e-06,
"loss": 1.3878,
"mean_token_accuracy": 0.6564305797219276,
"num_tokens": 9201977.0,
"step": 707
},
{
"epoch": 0.45312,
"grad_norm": 4.199508190155029,
"learning_rate": 5e-06,
"loss": 1.1504,
"mean_token_accuracy": 0.6836559697985649,
"num_tokens": 9212342.0,
"step": 708
},
{
"epoch": 0.45376,
"grad_norm": 3.685267686843872,
"learning_rate": 5e-06,
"loss": 1.3698,
"mean_token_accuracy": 0.6535738334059715,
"num_tokens": 9225755.0,
"step": 709
},
{
"epoch": 0.4544,
"grad_norm": 3.736710786819458,
"learning_rate": 5e-06,
"loss": 1.2562,
"mean_token_accuracy": 0.6680023595690727,
"num_tokens": 9237738.0,
"step": 710
},
{
"epoch": 0.45504,
"grad_norm": 3.5703136920928955,
"learning_rate": 5e-06,
"loss": 1.5439,
"mean_token_accuracy": 0.6260874792933464,
"num_tokens": 9250443.0,
"step": 711
},
{
"epoch": 0.45568,
"grad_norm": 3.6314592361450195,
"learning_rate": 5e-06,
"loss": 1.2796,
"mean_token_accuracy": 0.6624791696667671,
"num_tokens": 9262486.0,
"step": 712
},
{
"epoch": 0.45632,
"grad_norm": 3.988708019256592,
"learning_rate": 5e-06,
"loss": 1.3134,
"mean_token_accuracy": 0.6787229478359222,
"num_tokens": 9274110.0,
"step": 713
},
{
"epoch": 0.45696,
"grad_norm": 4.141347885131836,
"learning_rate": 5e-06,
"loss": 1.2379,
"mean_token_accuracy": 0.6678915992379189,
"num_tokens": 9285461.0,
"step": 714
},
{
"epoch": 0.4576,
"grad_norm": 4.030619144439697,
"learning_rate": 5e-06,
"loss": 1.4524,
"mean_token_accuracy": 0.6373696550726891,
"num_tokens": 9297793.0,
"step": 715
},
{
"epoch": 0.45824,
"grad_norm": 3.8397583961486816,
"learning_rate": 5e-06,
"loss": 1.3026,
"mean_token_accuracy": 0.6478614434599876,
"num_tokens": 9309837.0,
"step": 716
},
{
"epoch": 0.45888,
"grad_norm": 3.211944580078125,
"learning_rate": 5e-06,
"loss": 1.3505,
"mean_token_accuracy": 0.6571612730622292,
"num_tokens": 9325986.0,
"step": 717
},
{
"epoch": 0.45952,
"grad_norm": 3.775752305984497,
"learning_rate": 5e-06,
"loss": 1.4041,
"mean_token_accuracy": 0.6476349085569382,
"num_tokens": 9339126.0,
"step": 718
},
{
"epoch": 0.46016,
"grad_norm": 3.311610221862793,
"learning_rate": 5e-06,
"loss": 1.2113,
"mean_token_accuracy": 0.6731600984930992,
"num_tokens": 9354016.0,
"step": 719
},
{
"epoch": 0.4608,
"grad_norm": 3.6527278423309326,
"learning_rate": 5e-06,
"loss": 1.4461,
"mean_token_accuracy": 0.6173442825675011,
"num_tokens": 9366261.0,
"step": 720
},
{
"epoch": 0.46144,
"grad_norm": 3.3843095302581787,
"learning_rate": 5e-06,
"loss": 1.1579,
"mean_token_accuracy": 0.6829282343387604,
"num_tokens": 9381276.0,
"step": 721
},
{
"epoch": 0.46208,
"grad_norm": 3.229539394378662,
"learning_rate": 5e-06,
"loss": 1.2984,
"mean_token_accuracy": 0.6582097262144089,
"num_tokens": 9395271.0,
"step": 722
},
{
"epoch": 0.46272,
"grad_norm": 3.170426607131958,
"learning_rate": 5e-06,
"loss": 1.2019,
"mean_token_accuracy": 0.6727022156119347,
"num_tokens": 9409204.0,
"step": 723
},
{
"epoch": 0.46336,
"grad_norm": 3.8384881019592285,
"learning_rate": 5e-06,
"loss": 1.3632,
"mean_token_accuracy": 0.6532674580812454,
"num_tokens": 9423838.0,
"step": 724
},
{
"epoch": 0.464,
"grad_norm": 4.176010608673096,
"learning_rate": 5e-06,
"loss": 1.3754,
"mean_token_accuracy": 0.6710385903716087,
"num_tokens": 9434639.0,
"step": 725
},
{
"epoch": 0.46464,
"grad_norm": 3.5365447998046875,
"learning_rate": 5e-06,
"loss": 1.2791,
"mean_token_accuracy": 0.6657568737864494,
"num_tokens": 9446770.0,
"step": 726
},
{
"epoch": 0.46528,
"grad_norm": 3.4129528999328613,
"learning_rate": 5e-06,
"loss": 1.3072,
"mean_token_accuracy": 0.6470441669225693,
"num_tokens": 9460600.0,
"step": 727
},
{
"epoch": 0.46592,
"grad_norm": 4.013781547546387,
"learning_rate": 5e-06,
"loss": 1.3892,
"mean_token_accuracy": 0.656228207051754,
"num_tokens": 9472044.0,
"step": 728
},
{
"epoch": 0.46656,
"grad_norm": 3.449136734008789,
"learning_rate": 5e-06,
"loss": 1.372,
"mean_token_accuracy": 0.6523317843675613,
"num_tokens": 9484363.0,
"step": 729
},
{
"epoch": 0.4672,
"grad_norm": 3.7383124828338623,
"learning_rate": 5e-06,
"loss": 1.3216,
"mean_token_accuracy": 0.6544977352023125,
"num_tokens": 9496620.0,
"step": 730
},
{
"epoch": 0.46784,
"grad_norm": 3.362048864364624,
"learning_rate": 5e-06,
"loss": 1.3853,
"mean_token_accuracy": 0.6726252436637878,
"num_tokens": 9510670.0,
"step": 731
},
{
"epoch": 0.46848,
"grad_norm": 3.314443826675415,
"learning_rate": 5e-06,
"loss": 1.4104,
"mean_token_accuracy": 0.6626652106642723,
"num_tokens": 9525327.0,
"step": 732
},
{
"epoch": 0.46912,
"grad_norm": 3.8517005443573,
"learning_rate": 5e-06,
"loss": 1.3099,
"mean_token_accuracy": 0.650071769952774,
"num_tokens": 9537583.0,
"step": 733
},
{
"epoch": 0.46976,
"grad_norm": 3.4071006774902344,
"learning_rate": 5e-06,
"loss": 1.0845,
"mean_token_accuracy": 0.6916024461388588,
"num_tokens": 9550411.0,
"step": 734
},
{
"epoch": 0.4704,
"grad_norm": 4.703375816345215,
"learning_rate": 5e-06,
"loss": 1.2984,
"mean_token_accuracy": 0.6852922365069389,
"num_tokens": 9561144.0,
"step": 735
},
{
"epoch": 0.47104,
"grad_norm": 3.5826289653778076,
"learning_rate": 5e-06,
"loss": 1.5058,
"mean_token_accuracy": 0.6203412935137749,
"num_tokens": 9575541.0,
"step": 736
},
{
"epoch": 0.47168,
"grad_norm": 3.2071099281311035,
"learning_rate": 5e-06,
"loss": 1.2267,
"mean_token_accuracy": 0.6681589409708977,
"num_tokens": 9591163.0,
"step": 737
},
{
"epoch": 0.47232,
"grad_norm": 3.8028645515441895,
"learning_rate": 5e-06,
"loss": 1.4041,
"mean_token_accuracy": 0.6411704197525978,
"num_tokens": 9604337.0,
"step": 738
},
{
"epoch": 0.47296,
"grad_norm": 3.5578410625457764,
"learning_rate": 5e-06,
"loss": 1.4089,
"mean_token_accuracy": 0.6202419102191925,
"num_tokens": 9618994.0,
"step": 739
},
{
"epoch": 0.4736,
"grad_norm": 4.015564441680908,
"learning_rate": 5e-06,
"loss": 1.3252,
"mean_token_accuracy": 0.6425078436732292,
"num_tokens": 9629590.0,
"step": 740
},
{
"epoch": 0.47424,
"grad_norm": 3.3953940868377686,
"learning_rate": 5e-06,
"loss": 1.2248,
"mean_token_accuracy": 0.6555972173810005,
"num_tokens": 9643383.0,
"step": 741
},
{
"epoch": 0.47488,
"grad_norm": 3.509755849838257,
"learning_rate": 5e-06,
"loss": 1.2274,
"mean_token_accuracy": 0.6479950994253159,
"num_tokens": 9657137.0,
"step": 742
},
{
"epoch": 0.47552,
"grad_norm": 3.403864622116089,
"learning_rate": 5e-06,
"loss": 1.3223,
"mean_token_accuracy": 0.6538697630167007,
"num_tokens": 9670515.0,
"step": 743
},
{
"epoch": 0.47616,
"grad_norm": 3.5815911293029785,
"learning_rate": 5e-06,
"loss": 1.3843,
"mean_token_accuracy": 0.6228384971618652,
"num_tokens": 9682689.0,
"step": 744
},
{
"epoch": 0.4768,
"grad_norm": 4.452811241149902,
"learning_rate": 5e-06,
"loss": 1.3518,
"mean_token_accuracy": 0.6771413907408714,
"num_tokens": 9693372.0,
"step": 745
},
{
"epoch": 0.47744,
"grad_norm": 4.269803524017334,
"learning_rate": 5e-06,
"loss": 1.212,
"mean_token_accuracy": 0.672097809612751,
"num_tokens": 9702991.0,
"step": 746
},
{
"epoch": 0.47808,
"grad_norm": 3.6928703784942627,
"learning_rate": 5e-06,
"loss": 1.3458,
"mean_token_accuracy": 0.6829958707094193,
"num_tokens": 9716472.0,
"step": 747
},
{
"epoch": 0.47872,
"grad_norm": 3.9352831840515137,
"learning_rate": 5e-06,
"loss": 1.4422,
"mean_token_accuracy": 0.6457963958382607,
"num_tokens": 9730346.0,
"step": 748
},
{
"epoch": 0.47936,
"grad_norm": 4.322943687438965,
"learning_rate": 5e-06,
"loss": 1.0575,
"mean_token_accuracy": 0.6857285089790821,
"num_tokens": 9742613.0,
"step": 749
},
{
"epoch": 0.48,
"grad_norm": 3.4020259380340576,
"learning_rate": 5e-06,
"loss": 1.3059,
"mean_token_accuracy": 0.6406174898147583,
"num_tokens": 9754833.0,
"step": 750
},
{
"epoch": 0.48064,
"grad_norm": 3.288209915161133,
"learning_rate": 5e-06,
"loss": 1.3021,
"mean_token_accuracy": 0.6589253880083561,
"num_tokens": 9769316.0,
"step": 751
},
{
"epoch": 0.48128,
"grad_norm": 3.2498161792755127,
"learning_rate": 5e-06,
"loss": 1.4946,
"mean_token_accuracy": 0.6402696147561073,
"num_tokens": 9783768.0,
"step": 752
},
{
"epoch": 0.48192,
"grad_norm": 3.8162779808044434,
"learning_rate": 5e-06,
"loss": 1.3911,
"mean_token_accuracy": 0.6404719427227974,
"num_tokens": 9797279.0,
"step": 753
},
{
"epoch": 0.48256,
"grad_norm": 4.253142833709717,
"learning_rate": 5e-06,
"loss": 1.4797,
"mean_token_accuracy": 0.6313204690814018,
"num_tokens": 9808629.0,
"step": 754
},
{
"epoch": 0.4832,
"grad_norm": 3.716420888900757,
"learning_rate": 5e-06,
"loss": 1.3401,
"mean_token_accuracy": 0.6325643435120583,
"num_tokens": 9821968.0,
"step": 755
},
{
"epoch": 0.48384,
"grad_norm": 3.5335354804992676,
"learning_rate": 5e-06,
"loss": 1.3272,
"mean_token_accuracy": 0.6529423892498016,
"num_tokens": 9835554.0,
"step": 756
},
{
"epoch": 0.48448,
"grad_norm": 3.359344005584717,
"learning_rate": 5e-06,
"loss": 1.3782,
"mean_token_accuracy": 0.6145281083881855,
"num_tokens": 9849101.0,
"step": 757
},
{
"epoch": 0.48512,
"grad_norm": 3.4545371532440186,
"learning_rate": 5e-06,
"loss": 1.5942,
"mean_token_accuracy": 0.6196755021810532,
"num_tokens": 9862033.0,
"step": 758
},
{
"epoch": 0.48576,
"grad_norm": 3.6362133026123047,
"learning_rate": 5e-06,
"loss": 1.3524,
"mean_token_accuracy": 0.6347524076700211,
"num_tokens": 9873692.0,
"step": 759
},
{
"epoch": 0.4864,
"grad_norm": 3.699906826019287,
"learning_rate": 5e-06,
"loss": 1.2751,
"mean_token_accuracy": 0.656343087553978,
"num_tokens": 9885375.0,
"step": 760
},
{
"epoch": 0.48704,
"grad_norm": 3.8104074001312256,
"learning_rate": 5e-06,
"loss": 1.4196,
"mean_token_accuracy": 0.630670964717865,
"num_tokens": 9897179.0,
"step": 761
},
{
"epoch": 0.48768,
"grad_norm": 3.5518436431884766,
"learning_rate": 5e-06,
"loss": 1.1912,
"mean_token_accuracy": 0.6667153909802437,
"num_tokens": 9909370.0,
"step": 762
},
{
"epoch": 0.48832,
"grad_norm": 3.4174013137817383,
"learning_rate": 5e-06,
"loss": 1.4634,
"mean_token_accuracy": 0.6383125334978104,
"num_tokens": 9924593.0,
"step": 763
},
{
"epoch": 0.48896,
"grad_norm": 3.690223217010498,
"learning_rate": 5e-06,
"loss": 1.2632,
"mean_token_accuracy": 0.6747411042451859,
"num_tokens": 9938123.0,
"step": 764
},
{
"epoch": 0.4896,
"grad_norm": 3.189453125,
"learning_rate": 5e-06,
"loss": 1.2344,
"mean_token_accuracy": 0.6770232170820236,
"num_tokens": 9952420.0,
"step": 765
},
{
"epoch": 0.49024,
"grad_norm": 4.607802867889404,
"learning_rate": 5e-06,
"loss": 1.2471,
"mean_token_accuracy": 0.6755125150084496,
"num_tokens": 9962228.0,
"step": 766
},
{
"epoch": 0.49088,
"grad_norm": 3.5634379386901855,
"learning_rate": 5e-06,
"loss": 1.1842,
"mean_token_accuracy": 0.6926667168736458,
"num_tokens": 9974613.0,
"step": 767
},
{
"epoch": 0.49152,
"grad_norm": 3.5588109493255615,
"learning_rate": 5e-06,
"loss": 1.3507,
"mean_token_accuracy": 0.6611402109265327,
"num_tokens": 9988413.0,
"step": 768
},
{
"epoch": 0.49216,
"grad_norm": 3.356700897216797,
"learning_rate": 5e-06,
"loss": 1.6306,
"mean_token_accuracy": 0.6335580386221409,
"num_tokens": 10003108.0,
"step": 769
},
{
"epoch": 0.4928,
"grad_norm": 4.425334453582764,
"learning_rate": 5e-06,
"loss": 1.379,
"mean_token_accuracy": 0.6304365694522858,
"num_tokens": 10015961.0,
"step": 770
},
{
"epoch": 0.49344,
"grad_norm": 3.2346768379211426,
"learning_rate": 5e-06,
"loss": 1.194,
"mean_token_accuracy": 0.6939344108104706,
"num_tokens": 10028885.0,
"step": 771
},
{
"epoch": 0.49408,
"grad_norm": 2.969572067260742,
"learning_rate": 5e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.6672687157988548,
"num_tokens": 10044177.0,
"step": 772
},
{
"epoch": 0.49472,
"grad_norm": 3.9597513675689697,
"learning_rate": 5e-06,
"loss": 1.2808,
"mean_token_accuracy": 0.6557754501700401,
"num_tokens": 10055808.0,
"step": 773
},
{
"epoch": 0.49536,
"grad_norm": 3.450819730758667,
"learning_rate": 5e-06,
"loss": 1.0655,
"mean_token_accuracy": 0.6846578419208527,
"num_tokens": 10071283.0,
"step": 774
},
{
"epoch": 0.496,
"grad_norm": 3.999828338623047,
"learning_rate": 5e-06,
"loss": 1.4067,
"mean_token_accuracy": 0.6274128258228302,
"num_tokens": 10083352.0,
"step": 775
},
{
"epoch": 0.49664,
"grad_norm": 4.048245429992676,
"learning_rate": 5e-06,
"loss": 1.2898,
"mean_token_accuracy": 0.6655979752540588,
"num_tokens": 10095482.0,
"step": 776
},
{
"epoch": 0.49728,
"grad_norm": 3.832430124282837,
"learning_rate": 5e-06,
"loss": 1.3077,
"mean_token_accuracy": 0.6555028632283211,
"num_tokens": 10108448.0,
"step": 777
},
{
"epoch": 0.49792,
"grad_norm": 3.215700626373291,
"learning_rate": 5e-06,
"loss": 1.3706,
"mean_token_accuracy": 0.653311513364315,
"num_tokens": 10122582.0,
"step": 778
},
{
"epoch": 0.49856,
"grad_norm": 3.9083938598632812,
"learning_rate": 5e-06,
"loss": 1.4162,
"mean_token_accuracy": 0.666377916932106,
"num_tokens": 10133857.0,
"step": 779
},
{
"epoch": 0.4992,
"grad_norm": 3.3011085987091064,
"learning_rate": 5e-06,
"loss": 1.4602,
"mean_token_accuracy": 0.6177773475646973,
"num_tokens": 10149390.0,
"step": 780
},
{
"epoch": 0.49984,
"grad_norm": 4.202136516571045,
"learning_rate": 5e-06,
"loss": 1.348,
"mean_token_accuracy": 0.6335294619202614,
"num_tokens": 10160191.0,
"step": 781
},
{
"epoch": 0.50048,
"grad_norm": 3.8100340366363525,
"learning_rate": 5e-06,
"loss": 1.286,
"mean_token_accuracy": 0.664748452603817,
"num_tokens": 10174169.0,
"step": 782
},
{
"epoch": 0.50112,
"grad_norm": 3.2231757640838623,
"learning_rate": 5e-06,
"loss": 1.5273,
"mean_token_accuracy": 0.6323799937963486,
"num_tokens": 10189013.0,
"step": 783
},
{
"epoch": 0.50176,
"grad_norm": 3.380337953567505,
"learning_rate": 5e-06,
"loss": 1.2976,
"mean_token_accuracy": 0.6576580554246902,
"num_tokens": 10205197.0,
"step": 784
},
{
"epoch": 0.5024,
"grad_norm": 3.5312960147857666,
"learning_rate": 5e-06,
"loss": 1.4795,
"mean_token_accuracy": 0.6108041927218437,
"num_tokens": 10218782.0,
"step": 785
},
{
"epoch": 0.50304,
"grad_norm": 3.7805802822113037,
"learning_rate": 5e-06,
"loss": 1.1567,
"mean_token_accuracy": 0.7012158781290054,
"num_tokens": 10229754.0,
"step": 786
},
{
"epoch": 0.50368,
"grad_norm": 3.575208902359009,
"learning_rate": 5e-06,
"loss": 1.2914,
"mean_token_accuracy": 0.6698039025068283,
"num_tokens": 10243519.0,
"step": 787
},
{
"epoch": 0.50432,
"grad_norm": 4.018414497375488,
"learning_rate": 5e-06,
"loss": 1.2843,
"mean_token_accuracy": 0.6500632241368294,
"num_tokens": 10255436.0,
"step": 788
},
{
"epoch": 0.50496,
"grad_norm": 3.3472957611083984,
"learning_rate": 5e-06,
"loss": 1.409,
"mean_token_accuracy": 0.6223405599594116,
"num_tokens": 10270218.0,
"step": 789
},
{
"epoch": 0.5056,
"grad_norm": 3.555922031402588,
"learning_rate": 5e-06,
"loss": 1.4861,
"mean_token_accuracy": 0.6319947242736816,
"num_tokens": 10283818.0,
"step": 790
},
{
"epoch": 0.50624,
"grad_norm": 3.2534327507019043,
"learning_rate": 5e-06,
"loss": 1.505,
"mean_token_accuracy": 0.6172455549240112,
"num_tokens": 10299119.0,
"step": 791
},
{
"epoch": 0.50688,
"grad_norm": 3.78558087348938,
"learning_rate": 5e-06,
"loss": 1.3765,
"mean_token_accuracy": 0.6513939723372459,
"num_tokens": 10311103.0,
"step": 792
},
{
"epoch": 0.50752,
"grad_norm": 3.426884412765503,
"learning_rate": 5e-06,
"loss": 1.3942,
"mean_token_accuracy": 0.6268276050686836,
"num_tokens": 10324256.0,
"step": 793
},
{
"epoch": 0.50816,
"grad_norm": 3.586442470550537,
"learning_rate": 5e-06,
"loss": 1.291,
"mean_token_accuracy": 0.6560362279415131,
"num_tokens": 10336275.0,
"step": 794
},
{
"epoch": 0.5088,
"grad_norm": 3.6246700286865234,
"learning_rate": 5e-06,
"loss": 1.4371,
"mean_token_accuracy": 0.64004335552454,
"num_tokens": 10348939.0,
"step": 795
},
{
"epoch": 0.50944,
"grad_norm": 3.8206660747528076,
"learning_rate": 5e-06,
"loss": 1.3641,
"mean_token_accuracy": 0.6557733938097954,
"num_tokens": 10360108.0,
"step": 796
},
{
"epoch": 0.51008,
"grad_norm": 4.05738639831543,
"learning_rate": 5e-06,
"loss": 1.3265,
"mean_token_accuracy": 0.6359497681260109,
"num_tokens": 10371144.0,
"step": 797
},
{
"epoch": 0.51072,
"grad_norm": 3.9649102687835693,
"learning_rate": 5e-06,
"loss": 1.4398,
"mean_token_accuracy": 0.6443544700741768,
"num_tokens": 10381537.0,
"step": 798
},
{
"epoch": 0.51136,
"grad_norm": 3.3141987323760986,
"learning_rate": 5e-06,
"loss": 1.4374,
"mean_token_accuracy": 0.6308365762233734,
"num_tokens": 10396539.0,
"step": 799
},
{
"epoch": 0.512,
"grad_norm": 3.1399598121643066,
"learning_rate": 5e-06,
"loss": 1.3463,
"mean_token_accuracy": 0.6553637236356735,
"num_tokens": 10412962.0,
"step": 800
},
{
"epoch": 0.51264,
"grad_norm": 3.690521478652954,
"learning_rate": 5e-06,
"loss": 1.4432,
"mean_token_accuracy": 0.6525074169039726,
"num_tokens": 10426314.0,
"step": 801
},
{
"epoch": 0.51328,
"grad_norm": 3.5348432064056396,
"learning_rate": 5e-06,
"loss": 1.5491,
"mean_token_accuracy": 0.5989897102117538,
"num_tokens": 10439348.0,
"step": 802
},
{
"epoch": 0.51392,
"grad_norm": 3.760218620300293,
"learning_rate": 5e-06,
"loss": 1.0484,
"mean_token_accuracy": 0.6977821663022041,
"num_tokens": 10449973.0,
"step": 803
},
{
"epoch": 0.51456,
"grad_norm": 3.7311551570892334,
"learning_rate": 5e-06,
"loss": 1.4811,
"mean_token_accuracy": 0.630589596927166,
"num_tokens": 10462019.0,
"step": 804
},
{
"epoch": 0.5152,
"grad_norm": 3.4669084548950195,
"learning_rate": 5e-06,
"loss": 1.3778,
"mean_token_accuracy": 0.6389184445142746,
"num_tokens": 10474583.0,
"step": 805
},
{
"epoch": 0.51584,
"grad_norm": 3.2782742977142334,
"learning_rate": 5e-06,
"loss": 1.2256,
"mean_token_accuracy": 0.6817874610424042,
"num_tokens": 10488568.0,
"step": 806
},
{
"epoch": 0.51648,
"grad_norm": 4.345005035400391,
"learning_rate": 5e-06,
"loss": 1.3498,
"mean_token_accuracy": 0.6508120521903038,
"num_tokens": 10500968.0,
"step": 807
},
{
"epoch": 0.51712,
"grad_norm": 3.8742589950561523,
"learning_rate": 5e-06,
"loss": 1.4047,
"mean_token_accuracy": 0.634697936475277,
"num_tokens": 10513586.0,
"step": 808
},
{
"epoch": 0.51776,
"grad_norm": 3.4968934059143066,
"learning_rate": 5e-06,
"loss": 1.4397,
"mean_token_accuracy": 0.6320827975869179,
"num_tokens": 10528263.0,
"step": 809
},
{
"epoch": 0.5184,
"grad_norm": 3.89860463142395,
"learning_rate": 5e-06,
"loss": 1.299,
"mean_token_accuracy": 0.6727789714932442,
"num_tokens": 10539352.0,
"step": 810
},
{
"epoch": 0.51904,
"grad_norm": 3.1833720207214355,
"learning_rate": 5e-06,
"loss": 1.2753,
"mean_token_accuracy": 0.6482102572917938,
"num_tokens": 10553515.0,
"step": 811
},
{
"epoch": 0.51968,
"grad_norm": 3.3082292079925537,
"learning_rate": 5e-06,
"loss": 1.2998,
"mean_token_accuracy": 0.6546992510557175,
"num_tokens": 10566663.0,
"step": 812
},
{
"epoch": 0.52032,
"grad_norm": 3.6185340881347656,
"learning_rate": 5e-06,
"loss": 1.3815,
"mean_token_accuracy": 0.6625709310173988,
"num_tokens": 10579584.0,
"step": 813
},
{
"epoch": 0.52096,
"grad_norm": 3.55534291267395,
"learning_rate": 5e-06,
"loss": 1.1128,
"mean_token_accuracy": 0.6956586241722107,
"num_tokens": 10591791.0,
"step": 814
},
{
"epoch": 0.5216,
"grad_norm": 3.650907516479492,
"learning_rate": 5e-06,
"loss": 1.4477,
"mean_token_accuracy": 0.624105378985405,
"num_tokens": 10604975.0,
"step": 815
},
{
"epoch": 0.52224,
"grad_norm": 3.9432995319366455,
"learning_rate": 5e-06,
"loss": 1.3038,
"mean_token_accuracy": 0.6531900316476822,
"num_tokens": 10616449.0,
"step": 816
},
{
"epoch": 0.52288,
"grad_norm": 3.8777639865875244,
"learning_rate": 5e-06,
"loss": 1.47,
"mean_token_accuracy": 0.6426081731915474,
"num_tokens": 10626994.0,
"step": 817
},
{
"epoch": 0.52352,
"grad_norm": 3.064539909362793,
"learning_rate": 5e-06,
"loss": 1.2586,
"mean_token_accuracy": 0.6581793427467346,
"num_tokens": 10642855.0,
"step": 818
},
{
"epoch": 0.52416,
"grad_norm": 3.3149263858795166,
"learning_rate": 5e-06,
"loss": 1.4482,
"mean_token_accuracy": 0.6364821642637253,
"num_tokens": 10656056.0,
"step": 819
},
{
"epoch": 0.5248,
"grad_norm": 4.199079990386963,
"learning_rate": 5e-06,
"loss": 1.4089,
"mean_token_accuracy": 0.6289070248603821,
"num_tokens": 10668057.0,
"step": 820
},
{
"epoch": 0.52544,
"grad_norm": 3.142550468444824,
"learning_rate": 5e-06,
"loss": 1.526,
"mean_token_accuracy": 0.6158655360341072,
"num_tokens": 10682235.0,
"step": 821
},
{
"epoch": 0.52608,
"grad_norm": 3.3503355979919434,
"learning_rate": 5e-06,
"loss": 1.2846,
"mean_token_accuracy": 0.6406090259552002,
"num_tokens": 10697588.0,
"step": 822
},
{
"epoch": 0.52672,
"grad_norm": 3.7879579067230225,
"learning_rate": 5e-06,
"loss": 1.4343,
"mean_token_accuracy": 0.6635532379150391,
"num_tokens": 10708737.0,
"step": 823
},
{
"epoch": 0.52736,
"grad_norm": 3.9766318798065186,
"learning_rate": 5e-06,
"loss": 1.352,
"mean_token_accuracy": 0.6262383349239826,
"num_tokens": 10721121.0,
"step": 824
},
{
"epoch": 0.528,
"grad_norm": 3.3426828384399414,
"learning_rate": 5e-06,
"loss": 1.2434,
"mean_token_accuracy": 0.6687643975019455,
"num_tokens": 10734364.0,
"step": 825
},
{
"epoch": 0.52864,
"grad_norm": 3.411301612854004,
"learning_rate": 5e-06,
"loss": 1.0815,
"mean_token_accuracy": 0.695383183658123,
"num_tokens": 10747101.0,
"step": 826
},
{
"epoch": 0.52928,
"grad_norm": 4.2775044441223145,
"learning_rate": 5e-06,
"loss": 1.1268,
"mean_token_accuracy": 0.7007181644439697,
"num_tokens": 10757435.0,
"step": 827
},
{
"epoch": 0.52992,
"grad_norm": 3.670020341873169,
"learning_rate": 5e-06,
"loss": 1.2358,
"mean_token_accuracy": 0.6553446725010872,
"num_tokens": 10768682.0,
"step": 828
},
{
"epoch": 0.53056,
"grad_norm": 3.6720056533813477,
"learning_rate": 5e-06,
"loss": 1.4772,
"mean_token_accuracy": 0.6402060613036156,
"num_tokens": 10781720.0,
"step": 829
},
{
"epoch": 0.5312,
"grad_norm": 4.194923400878906,
"learning_rate": 5e-06,
"loss": 1.1888,
"mean_token_accuracy": 0.6686526387929916,
"num_tokens": 10794142.0,
"step": 830
},
{
"epoch": 0.53184,
"grad_norm": 3.1744613647460938,
"learning_rate": 5e-06,
"loss": 1.4092,
"mean_token_accuracy": 0.6328324228525162,
"num_tokens": 10809011.0,
"step": 831
},
{
"epoch": 0.53248,
"grad_norm": 3.844196319580078,
"learning_rate": 5e-06,
"loss": 1.0955,
"mean_token_accuracy": 0.6974828243255615,
"num_tokens": 10819963.0,
"step": 832
},
{
"epoch": 0.53312,
"grad_norm": 3.668311834335327,
"learning_rate": 5e-06,
"loss": 1.2182,
"mean_token_accuracy": 0.6790367737412453,
"num_tokens": 10834237.0,
"step": 833
},
{
"epoch": 0.53376,
"grad_norm": 3.610236406326294,
"learning_rate": 5e-06,
"loss": 1.078,
"mean_token_accuracy": 0.7012447938323021,
"num_tokens": 10847095.0,
"step": 834
},
{
"epoch": 0.5344,
"grad_norm": 3.7682337760925293,
"learning_rate": 5e-06,
"loss": 1.1083,
"mean_token_accuracy": 0.6979804188013077,
"num_tokens": 10859861.0,
"step": 835
},
{
"epoch": 0.53504,
"grad_norm": 3.720351457595825,
"learning_rate": 5e-06,
"loss": 1.3023,
"mean_token_accuracy": 0.6436434611678123,
"num_tokens": 10871706.0,
"step": 836
},
{
"epoch": 0.53568,
"grad_norm": 3.608431816101074,
"learning_rate": 5e-06,
"loss": 1.5091,
"mean_token_accuracy": 0.613635927438736,
"num_tokens": 10884615.0,
"step": 837
},
{
"epoch": 0.53632,
"grad_norm": 3.321657419204712,
"learning_rate": 5e-06,
"loss": 1.1323,
"mean_token_accuracy": 0.6927084550261497,
"num_tokens": 10898152.0,
"step": 838
},
{
"epoch": 0.53696,
"grad_norm": 2.9468841552734375,
"learning_rate": 5e-06,
"loss": 1.3507,
"mean_token_accuracy": 0.6446966454386711,
"num_tokens": 10915868.0,
"step": 839
},
{
"epoch": 0.5376,
"grad_norm": 3.565668821334839,
"learning_rate": 5e-06,
"loss": 1.4081,
"mean_token_accuracy": 0.6363908722996712,
"num_tokens": 10930562.0,
"step": 840
},
{
"epoch": 0.53824,
"grad_norm": 3.9890897274017334,
"learning_rate": 5e-06,
"loss": 1.4644,
"mean_token_accuracy": 0.6419349610805511,
"num_tokens": 10942958.0,
"step": 841
},
{
"epoch": 0.53888,
"grad_norm": 3.5691657066345215,
"learning_rate": 5e-06,
"loss": 1.461,
"mean_token_accuracy": 0.6422456279397011,
"num_tokens": 10955566.0,
"step": 842
},
{
"epoch": 0.53952,
"grad_norm": 3.0054261684417725,
"learning_rate": 5e-06,
"loss": 1.32,
"mean_token_accuracy": 0.6669855192303658,
"num_tokens": 10971064.0,
"step": 843
},
{
"epoch": 0.54016,
"grad_norm": 2.9434778690338135,
"learning_rate": 5e-06,
"loss": 1.343,
"mean_token_accuracy": 0.6634941324591637,
"num_tokens": 10987737.0,
"step": 844
},
{
"epoch": 0.5408,
"grad_norm": 4.207048416137695,
"learning_rate": 5e-06,
"loss": 1.338,
"mean_token_accuracy": 0.6447890773415565,
"num_tokens": 10998859.0,
"step": 845
},
{
"epoch": 0.54144,
"grad_norm": 3.3798792362213135,
"learning_rate": 5e-06,
"loss": 1.5105,
"mean_token_accuracy": 0.6433539763092995,
"num_tokens": 11013214.0,
"step": 846
},
{
"epoch": 0.54208,
"grad_norm": 3.163572311401367,
"learning_rate": 5e-06,
"loss": 1.3399,
"mean_token_accuracy": 0.6501183435320854,
"num_tokens": 11028566.0,
"step": 847
},
{
"epoch": 0.54272,
"grad_norm": 3.5735156536102295,
"learning_rate": 5e-06,
"loss": 1.6305,
"mean_token_accuracy": 0.6012577600777149,
"num_tokens": 11043246.0,
"step": 848
},
{
"epoch": 0.54336,
"grad_norm": 4.034946441650391,
"learning_rate": 5e-06,
"loss": 1.1873,
"mean_token_accuracy": 0.6620588451623917,
"num_tokens": 11055228.0,
"step": 849
},
{
"epoch": 0.544,
"grad_norm": 3.2156589031219482,
"learning_rate": 5e-06,
"loss": 1.4072,
"mean_token_accuracy": 0.6418510600924492,
"num_tokens": 11068267.0,
"step": 850
},
{
"epoch": 0.54464,
"grad_norm": 4.0673723220825195,
"learning_rate": 5e-06,
"loss": 1.2545,
"mean_token_accuracy": 0.6594027280807495,
"num_tokens": 11080978.0,
"step": 851
},
{
"epoch": 0.54528,
"grad_norm": 3.5857112407684326,
"learning_rate": 5e-06,
"loss": 1.4054,
"mean_token_accuracy": 0.6293277516961098,
"num_tokens": 11096210.0,
"step": 852
},
{
"epoch": 0.54592,
"grad_norm": 3.829974889755249,
"learning_rate": 5e-06,
"loss": 1.3174,
"mean_token_accuracy": 0.6636649072170258,
"num_tokens": 11108254.0,
"step": 853
},
{
"epoch": 0.54656,
"grad_norm": 3.5567145347595215,
"learning_rate": 5e-06,
"loss": 1.2567,
"mean_token_accuracy": 0.677757516503334,
"num_tokens": 11120465.0,
"step": 854
},
{
"epoch": 0.5472,
"grad_norm": 4.473601341247559,
"learning_rate": 5e-06,
"loss": 1.2163,
"mean_token_accuracy": 0.6928970888257027,
"num_tokens": 11131956.0,
"step": 855
},
{
"epoch": 0.54784,
"grad_norm": 3.658292293548584,
"learning_rate": 5e-06,
"loss": 1.2401,
"mean_token_accuracy": 0.6552563831210136,
"num_tokens": 11144655.0,
"step": 856
},
{
"epoch": 0.54848,
"grad_norm": 3.061565399169922,
"learning_rate": 5e-06,
"loss": 1.1552,
"mean_token_accuracy": 0.6949977725744247,
"num_tokens": 11159566.0,
"step": 857
},
{
"epoch": 0.54912,
"grad_norm": 3.8165862560272217,
"learning_rate": 5e-06,
"loss": 1.2363,
"mean_token_accuracy": 0.6678136140108109,
"num_tokens": 11172278.0,
"step": 858
},
{
"epoch": 0.54976,
"grad_norm": 3.937960147857666,
"learning_rate": 5e-06,
"loss": 1.1055,
"mean_token_accuracy": 0.7062254995107651,
"num_tokens": 11183273.0,
"step": 859
},
{
"epoch": 0.5504,
"grad_norm": 3.9735426902770996,
"learning_rate": 5e-06,
"loss": 1.2674,
"mean_token_accuracy": 0.6709722802042961,
"num_tokens": 11194956.0,
"step": 860
},
{
"epoch": 0.55104,
"grad_norm": 3.741502523422241,
"learning_rate": 5e-06,
"loss": 1.4785,
"mean_token_accuracy": 0.6148476675152779,
"num_tokens": 11209741.0,
"step": 861
},
{
"epoch": 0.55168,
"grad_norm": 3.544828176498413,
"learning_rate": 5e-06,
"loss": 1.3682,
"mean_token_accuracy": 0.6354441791772842,
"num_tokens": 11222629.0,
"step": 862
},
{
"epoch": 0.55232,
"grad_norm": 3.3560214042663574,
"learning_rate": 5e-06,
"loss": 1.1048,
"mean_token_accuracy": 0.6948365420103073,
"num_tokens": 11237834.0,
"step": 863
},
{
"epoch": 0.55296,
"grad_norm": 3.512924909591675,
"learning_rate": 5e-06,
"loss": 1.181,
"mean_token_accuracy": 0.6836326494812965,
"num_tokens": 11250638.0,
"step": 864
},
{
"epoch": 0.5536,
"grad_norm": 4.28767728805542,
"learning_rate": 5e-06,
"loss": 1.475,
"mean_token_accuracy": 0.6338236667215824,
"num_tokens": 11261887.0,
"step": 865
},
{
"epoch": 0.55424,
"grad_norm": 3.2134881019592285,
"learning_rate": 5e-06,
"loss": 1.3229,
"mean_token_accuracy": 0.6470964848995209,
"num_tokens": 11275931.0,
"step": 866
},
{
"epoch": 0.55488,
"grad_norm": 3.689152240753174,
"learning_rate": 5e-06,
"loss": 1.3717,
"mean_token_accuracy": 0.6691553071141243,
"num_tokens": 11287528.0,
"step": 867
},
{
"epoch": 0.55552,
"grad_norm": 3.289281129837036,
"learning_rate": 5e-06,
"loss": 1.2839,
"mean_token_accuracy": 0.6697202101349831,
"num_tokens": 11300231.0,
"step": 868
},
{
"epoch": 0.55616,
"grad_norm": 3.278754234313965,
"learning_rate": 5e-06,
"loss": 1.1297,
"mean_token_accuracy": 0.6946472376585007,
"num_tokens": 11315024.0,
"step": 869
},
{
"epoch": 0.5568,
"grad_norm": 3.2673239707946777,
"learning_rate": 5e-06,
"loss": 1.4874,
"mean_token_accuracy": 0.6627454794943333,
"num_tokens": 11329475.0,
"step": 870
},
{
"epoch": 0.55744,
"grad_norm": 3.1076149940490723,
"learning_rate": 5e-06,
"loss": 1.3683,
"mean_token_accuracy": 0.6290438398718834,
"num_tokens": 11343973.0,
"step": 871
},
{
"epoch": 0.55808,
"grad_norm": 3.526763439178467,
"learning_rate": 5e-06,
"loss": 1.3713,
"mean_token_accuracy": 0.6168685257434845,
"num_tokens": 11356517.0,
"step": 872
},
{
"epoch": 0.55872,
"grad_norm": 3.46929931640625,
"learning_rate": 5e-06,
"loss": 1.3024,
"mean_token_accuracy": 0.6537005454301834,
"num_tokens": 11369229.0,
"step": 873
},
{
"epoch": 0.55936,
"grad_norm": 3.599717617034912,
"learning_rate": 5e-06,
"loss": 1.3816,
"mean_token_accuracy": 0.641513504087925,
"num_tokens": 11382702.0,
"step": 874
},
{
"epoch": 0.56,
"grad_norm": 3.80094313621521,
"learning_rate": 5e-06,
"loss": 1.5008,
"mean_token_accuracy": 0.6274667903780937,
"num_tokens": 11396562.0,
"step": 875
},
{
"epoch": 0.56064,
"grad_norm": 4.2999067306518555,
"learning_rate": 5e-06,
"loss": 1.2018,
"mean_token_accuracy": 0.6762094050645828,
"num_tokens": 11406774.0,
"step": 876
},
{
"epoch": 0.56128,
"grad_norm": 3.715298652648926,
"learning_rate": 5e-06,
"loss": 1.2514,
"mean_token_accuracy": 0.6620960757136345,
"num_tokens": 11418251.0,
"step": 877
},
{
"epoch": 0.56192,
"grad_norm": 3.0805916786193848,
"learning_rate": 5e-06,
"loss": 1.1502,
"mean_token_accuracy": 0.684480644762516,
"num_tokens": 11433197.0,
"step": 878
},
{
"epoch": 0.56256,
"grad_norm": 3.6326444149017334,
"learning_rate": 5e-06,
"loss": 1.2656,
"mean_token_accuracy": 0.660808764398098,
"num_tokens": 11446639.0,
"step": 879
},
{
"epoch": 0.5632,
"grad_norm": 12.266148567199707,
"learning_rate": 5e-06,
"loss": 1.286,
"mean_token_accuracy": 0.6693281307816505,
"num_tokens": 11458609.0,
"step": 880
},
{
"epoch": 0.56384,
"grad_norm": 3.6536591053009033,
"learning_rate": 5e-06,
"loss": 1.2049,
"mean_token_accuracy": 0.6694196611642838,
"num_tokens": 11470645.0,
"step": 881
},
{
"epoch": 0.56448,
"grad_norm": 3.287473201751709,
"learning_rate": 5e-06,
"loss": 1.3294,
"mean_token_accuracy": 0.6692755967378616,
"num_tokens": 11484303.0,
"step": 882
},
{
"epoch": 0.56512,
"grad_norm": 3.7565791606903076,
"learning_rate": 5e-06,
"loss": 1.251,
"mean_token_accuracy": 0.6664244830608368,
"num_tokens": 11496299.0,
"step": 883
},
{
"epoch": 0.56576,
"grad_norm": 3.544475793838501,
"learning_rate": 5e-06,
"loss": 1.4526,
"mean_token_accuracy": 0.6100342273712158,
"num_tokens": 11510676.0,
"step": 884
},
{
"epoch": 0.5664,
"grad_norm": 3.682511568069458,
"learning_rate": 5e-06,
"loss": 1.4142,
"mean_token_accuracy": 0.6500721573829651,
"num_tokens": 11523371.0,
"step": 885
},
{
"epoch": 0.56704,
"grad_norm": 3.6271486282348633,
"learning_rate": 5e-06,
"loss": 1.1237,
"mean_token_accuracy": 0.6834971457719803,
"num_tokens": 11536061.0,
"step": 886
},
{
"epoch": 0.56768,
"grad_norm": 3.1198318004608154,
"learning_rate": 5e-06,
"loss": 1.2309,
"mean_token_accuracy": 0.658136211335659,
"num_tokens": 11550795.0,
"step": 887
},
{
"epoch": 0.56832,
"grad_norm": 3.9022724628448486,
"learning_rate": 5e-06,
"loss": 1.3044,
"mean_token_accuracy": 0.6979828551411629,
"num_tokens": 11564562.0,
"step": 888
},
{
"epoch": 0.56896,
"grad_norm": 3.295694351196289,
"learning_rate": 5e-06,
"loss": 1.4184,
"mean_token_accuracy": 0.60136728733778,
"num_tokens": 11578577.0,
"step": 889
},
{
"epoch": 0.5696,
"grad_norm": 3.0561180114746094,
"learning_rate": 5e-06,
"loss": 1.3529,
"mean_token_accuracy": 0.6563450619578362,
"num_tokens": 11594404.0,
"step": 890
},
{
"epoch": 0.57024,
"grad_norm": 3.44431471824646,
"learning_rate": 5e-06,
"loss": 1.1605,
"mean_token_accuracy": 0.6642890870571136,
"num_tokens": 11605723.0,
"step": 891
},
{
"epoch": 0.57088,
"grad_norm": 4.037685871124268,
"learning_rate": 5e-06,
"loss": 1.2558,
"mean_token_accuracy": 0.6648613065481186,
"num_tokens": 11619031.0,
"step": 892
},
{
"epoch": 0.57152,
"grad_norm": 3.2583799362182617,
"learning_rate": 5e-06,
"loss": 1.3105,
"mean_token_accuracy": 0.6500160917639732,
"num_tokens": 11634316.0,
"step": 893
},
{
"epoch": 0.57216,
"grad_norm": 3.2072439193725586,
"learning_rate": 5e-06,
"loss": 1.4559,
"mean_token_accuracy": 0.6469361782073975,
"num_tokens": 11650239.0,
"step": 894
},
{
"epoch": 0.5728,
"grad_norm": 3.4376208782196045,
"learning_rate": 5e-06,
"loss": 1.3858,
"mean_token_accuracy": 0.6572685986757278,
"num_tokens": 11662751.0,
"step": 895
},
{
"epoch": 0.57344,
"grad_norm": 3.647529363632202,
"learning_rate": 5e-06,
"loss": 1.3375,
"mean_token_accuracy": 0.6592177748680115,
"num_tokens": 11675377.0,
"step": 896
},
{
"epoch": 0.57408,
"grad_norm": 3.332850217819214,
"learning_rate": 5e-06,
"loss": 1.3343,
"mean_token_accuracy": 0.6491860672831535,
"num_tokens": 11688675.0,
"step": 897
},
{
"epoch": 0.57472,
"grad_norm": 4.066124439239502,
"learning_rate": 5e-06,
"loss": 1.3361,
"mean_token_accuracy": 0.6541391238570213,
"num_tokens": 11700393.0,
"step": 898
},
{
"epoch": 0.57536,
"grad_norm": 3.341097593307495,
"learning_rate": 5e-06,
"loss": 1.2515,
"mean_token_accuracy": 0.6574959680438042,
"num_tokens": 11714611.0,
"step": 899
},
{
"epoch": 0.576,
"grad_norm": 3.0946879386901855,
"learning_rate": 5e-06,
"loss": 1.242,
"mean_token_accuracy": 0.6756256222724915,
"num_tokens": 11731479.0,
"step": 900
},
{
"epoch": 0.57664,
"grad_norm": 3.3247451782226562,
"learning_rate": 5e-06,
"loss": 1.4695,
"mean_token_accuracy": 0.6269465908408165,
"num_tokens": 11748872.0,
"step": 901
},
{
"epoch": 0.57728,
"grad_norm": 3.942417860031128,
"learning_rate": 5e-06,
"loss": 1.3982,
"mean_token_accuracy": 0.6202432103455067,
"num_tokens": 11760014.0,
"step": 902
},
{
"epoch": 0.57792,
"grad_norm": 3.633100986480713,
"learning_rate": 5e-06,
"loss": 1.4247,
"mean_token_accuracy": 0.6394501402974129,
"num_tokens": 11773867.0,
"step": 903
},
{
"epoch": 0.57856,
"grad_norm": 3.383073568344116,
"learning_rate": 5e-06,
"loss": 1.1386,
"mean_token_accuracy": 0.697243720293045,
"num_tokens": 11787283.0,
"step": 904
},
{
"epoch": 0.5792,
"grad_norm": 3.678783416748047,
"learning_rate": 5e-06,
"loss": 1.3926,
"mean_token_accuracy": 0.6641874313354492,
"num_tokens": 11799263.0,
"step": 905
},
{
"epoch": 0.57984,
"grad_norm": 3.2661468982696533,
"learning_rate": 5e-06,
"loss": 1.5136,
"mean_token_accuracy": 0.6076219081878662,
"num_tokens": 11815606.0,
"step": 906
},
{
"epoch": 0.58048,
"grad_norm": 3.52829909324646,
"learning_rate": 5e-06,
"loss": 1.2213,
"mean_token_accuracy": 0.6809684634208679,
"num_tokens": 11829589.0,
"step": 907
},
{
"epoch": 0.58112,
"grad_norm": 3.6113576889038086,
"learning_rate": 5e-06,
"loss": 1.3111,
"mean_token_accuracy": 0.6847885251045227,
"num_tokens": 11842023.0,
"step": 908
},
{
"epoch": 0.58176,
"grad_norm": 4.104685306549072,
"learning_rate": 5e-06,
"loss": 1.4434,
"mean_token_accuracy": 0.6598182618618011,
"num_tokens": 11852988.0,
"step": 909
},
{
"epoch": 0.5824,
"grad_norm": 3.4313085079193115,
"learning_rate": 5e-06,
"loss": 1.4167,
"mean_token_accuracy": 0.6736102141439915,
"num_tokens": 11866706.0,
"step": 910
},
{
"epoch": 0.58304,
"grad_norm": 3.2502808570861816,
"learning_rate": 5e-06,
"loss": 1.3682,
"mean_token_accuracy": 0.6428176760673523,
"num_tokens": 11882889.0,
"step": 911
},
{
"epoch": 0.58368,
"grad_norm": 3.662310838699341,
"learning_rate": 5e-06,
"loss": 1.0834,
"mean_token_accuracy": 0.715592160820961,
"num_tokens": 11895792.0,
"step": 912
},
{
"epoch": 0.58432,
"grad_norm": 3.0405428409576416,
"learning_rate": 5e-06,
"loss": 1.2439,
"mean_token_accuracy": 0.6653807386755943,
"num_tokens": 11911225.0,
"step": 913
},
{
"epoch": 0.58496,
"grad_norm": 3.550328016281128,
"learning_rate": 5e-06,
"loss": 1.3896,
"mean_token_accuracy": 0.6358233094215393,
"num_tokens": 11924073.0,
"step": 914
},
{
"epoch": 0.5856,
"grad_norm": 3.2749056816101074,
"learning_rate": 5e-06,
"loss": 1.3361,
"mean_token_accuracy": 0.6581274121999741,
"num_tokens": 11938332.0,
"step": 915
},
{
"epoch": 0.58624,
"grad_norm": 3.873444080352783,
"learning_rate": 5e-06,
"loss": 1.3269,
"mean_token_accuracy": 0.6423259451985359,
"num_tokens": 11950429.0,
"step": 916
},
{
"epoch": 0.58688,
"grad_norm": 3.691632032394409,
"learning_rate": 5e-06,
"loss": 1.3576,
"mean_token_accuracy": 0.6709922403097153,
"num_tokens": 11962979.0,
"step": 917
},
{
"epoch": 0.58752,
"grad_norm": 3.1465516090393066,
"learning_rate": 5e-06,
"loss": 1.3155,
"mean_token_accuracy": 0.6728775128722191,
"num_tokens": 11978492.0,
"step": 918
},
{
"epoch": 0.58816,
"grad_norm": 3.738511562347412,
"learning_rate": 5e-06,
"loss": 1.1487,
"mean_token_accuracy": 0.706301674246788,
"num_tokens": 11991139.0,
"step": 919
},
{
"epoch": 0.5888,
"grad_norm": 3.288872241973877,
"learning_rate": 5e-06,
"loss": 1.3725,
"mean_token_accuracy": 0.6539236456155777,
"num_tokens": 12005815.0,
"step": 920
},
{
"epoch": 0.58944,
"grad_norm": 3.644181966781616,
"learning_rate": 5e-06,
"loss": 1.2652,
"mean_token_accuracy": 0.6836251989006996,
"num_tokens": 12017897.0,
"step": 921
},
{
"epoch": 0.59008,
"grad_norm": 3.8078083992004395,
"learning_rate": 5e-06,
"loss": 1.3801,
"mean_token_accuracy": 0.6469420120120049,
"num_tokens": 12030359.0,
"step": 922
},
{
"epoch": 0.59072,
"grad_norm": 3.2687323093414307,
"learning_rate": 5e-06,
"loss": 1.4049,
"mean_token_accuracy": 0.6341100111603737,
"num_tokens": 12044729.0,
"step": 923
},
{
"epoch": 0.59136,
"grad_norm": 3.4478020668029785,
"learning_rate": 5e-06,
"loss": 1.4287,
"mean_token_accuracy": 0.6452651098370552,
"num_tokens": 12058788.0,
"step": 924
},
{
"epoch": 0.592,
"grad_norm": 4.092494010925293,
"learning_rate": 5e-06,
"loss": 1.1304,
"mean_token_accuracy": 0.6950362101197243,
"num_tokens": 12069502.0,
"step": 925
},
{
"epoch": 0.59264,
"grad_norm": 4.566901683807373,
"learning_rate": 5e-06,
"loss": 1.261,
"mean_token_accuracy": 0.6621346473693848,
"num_tokens": 12080932.0,
"step": 926
},
{
"epoch": 0.59328,
"grad_norm": 3.4059062004089355,
"learning_rate": 5e-06,
"loss": 1.291,
"mean_token_accuracy": 0.6705317497253418,
"num_tokens": 12094725.0,
"step": 927
},
{
"epoch": 0.59392,
"grad_norm": 4.018156051635742,
"learning_rate": 5e-06,
"loss": 1.4457,
"mean_token_accuracy": 0.6595090329647064,
"num_tokens": 12107082.0,
"step": 928
},
{
"epoch": 0.59456,
"grad_norm": 3.448580741882324,
"learning_rate": 5e-06,
"loss": 1.2716,
"mean_token_accuracy": 0.6856422200798988,
"num_tokens": 12121239.0,
"step": 929
},
{
"epoch": 0.5952,
"grad_norm": 3.425841808319092,
"learning_rate": 5e-06,
"loss": 1.3174,
"mean_token_accuracy": 0.6534383073449135,
"num_tokens": 12134626.0,
"step": 930
},
{
"epoch": 0.59584,
"grad_norm": 4.416814804077148,
"learning_rate": 5e-06,
"loss": 1.2661,
"mean_token_accuracy": 0.6484142020344734,
"num_tokens": 12145951.0,
"step": 931
},
{
"epoch": 0.59648,
"grad_norm": 3.968085765838623,
"learning_rate": 5e-06,
"loss": 1.4512,
"mean_token_accuracy": 0.6346799582242966,
"num_tokens": 12157958.0,
"step": 932
},
{
"epoch": 0.59712,
"grad_norm": 3.6708478927612305,
"learning_rate": 5e-06,
"loss": 1.2783,
"mean_token_accuracy": 0.6855080351233482,
"num_tokens": 12170548.0,
"step": 933
},
{
"epoch": 0.59776,
"grad_norm": 3.8740973472595215,
"learning_rate": 5e-06,
"loss": 1.1046,
"mean_token_accuracy": 0.704432986676693,
"num_tokens": 12185292.0,
"step": 934
},
{
"epoch": 0.5984,
"grad_norm": 3.4846086502075195,
"learning_rate": 5e-06,
"loss": 1.3035,
"mean_token_accuracy": 0.6759809032082558,
"num_tokens": 12198081.0,
"step": 935
},
{
"epoch": 0.59904,
"grad_norm": 3.027975082397461,
"learning_rate": 5e-06,
"loss": 1.2897,
"mean_token_accuracy": 0.6573361679911613,
"num_tokens": 12214742.0,
"step": 936
},
{
"epoch": 0.59968,
"grad_norm": 3.879801034927368,
"learning_rate": 5e-06,
"loss": 1.3042,
"mean_token_accuracy": 0.641165092587471,
"num_tokens": 12225671.0,
"step": 937
},
{
"epoch": 0.60032,
"grad_norm": 3.933652877807617,
"learning_rate": 5e-06,
"loss": 1.0435,
"mean_token_accuracy": 0.7146632373332977,
"num_tokens": 12239781.0,
"step": 938
},
{
"epoch": 0.60096,
"grad_norm": 4.3125786781311035,
"learning_rate": 5e-06,
"loss": 1.2013,
"mean_token_accuracy": 0.6595223546028137,
"num_tokens": 12250660.0,
"step": 939
},
{
"epoch": 0.6016,
"grad_norm": 3.671967029571533,
"learning_rate": 5e-06,
"loss": 1.4127,
"mean_token_accuracy": 0.6495495587587357,
"num_tokens": 12261626.0,
"step": 940
},
{
"epoch": 0.60224,
"grad_norm": 3.524958610534668,
"learning_rate": 5e-06,
"loss": 1.5488,
"mean_token_accuracy": 0.6084389686584473,
"num_tokens": 12275245.0,
"step": 941
},
{
"epoch": 0.60288,
"grad_norm": 3.6148650646209717,
"learning_rate": 5e-06,
"loss": 1.3679,
"mean_token_accuracy": 0.6441814675927162,
"num_tokens": 12289591.0,
"step": 942
},
{
"epoch": 0.60352,
"grad_norm": 3.531022071838379,
"learning_rate": 5e-06,
"loss": 1.4175,
"mean_token_accuracy": 0.6326302289962769,
"num_tokens": 12303061.0,
"step": 943
},
{
"epoch": 0.60416,
"grad_norm": 3.5599935054779053,
"learning_rate": 5e-06,
"loss": 1.4981,
"mean_token_accuracy": 0.6270119249820709,
"num_tokens": 12317689.0,
"step": 944
},
{
"epoch": 0.6048,
"grad_norm": 3.125378370285034,
"learning_rate": 5e-06,
"loss": 1.0466,
"mean_token_accuracy": 0.7116389200091362,
"num_tokens": 12332378.0,
"step": 945
},
{
"epoch": 0.60544,
"grad_norm": 3.8127193450927734,
"learning_rate": 5e-06,
"loss": 1.3282,
"mean_token_accuracy": 0.6464278548955917,
"num_tokens": 12345052.0,
"step": 946
},
{
"epoch": 0.60608,
"grad_norm": 3.636815309524536,
"learning_rate": 5e-06,
"loss": 1.4728,
"mean_token_accuracy": 0.6262509748339653,
"num_tokens": 12356621.0,
"step": 947
},
{
"epoch": 0.60672,
"grad_norm": 3.3789074420928955,
"learning_rate": 5e-06,
"loss": 1.2371,
"mean_token_accuracy": 0.6846612468361855,
"num_tokens": 12370824.0,
"step": 948
},
{
"epoch": 0.60736,
"grad_norm": 3.5147576332092285,
"learning_rate": 5e-06,
"loss": 1.2429,
"mean_token_accuracy": 0.6510372906923294,
"num_tokens": 12382166.0,
"step": 949
},
{
"epoch": 0.608,
"grad_norm": 4.723844528198242,
"learning_rate": 5e-06,
"loss": 1.3264,
"mean_token_accuracy": 0.6632048487663269,
"num_tokens": 12394366.0,
"step": 950
},
{
"epoch": 0.60864,
"grad_norm": 3.679612398147583,
"learning_rate": 5e-06,
"loss": 1.3556,
"mean_token_accuracy": 0.6747320145368576,
"num_tokens": 12408326.0,
"step": 951
},
{
"epoch": 0.60928,
"grad_norm": 3.3034772872924805,
"learning_rate": 5e-06,
"loss": 1.5539,
"mean_token_accuracy": 0.6177156269550323,
"num_tokens": 12422379.0,
"step": 952
},
{
"epoch": 0.60992,
"grad_norm": 7.560748100280762,
"learning_rate": 5e-06,
"loss": 1.3543,
"mean_token_accuracy": 0.6603868454694748,
"num_tokens": 12434809.0,
"step": 953
},
{
"epoch": 0.61056,
"grad_norm": 4.265347003936768,
"learning_rate": 5e-06,
"loss": 1.4282,
"mean_token_accuracy": 0.6390318870544434,
"num_tokens": 12447614.0,
"step": 954
},
{
"epoch": 0.6112,
"grad_norm": 3.8850181102752686,
"learning_rate": 5e-06,
"loss": 1.2699,
"mean_token_accuracy": 0.6917356178164482,
"num_tokens": 12459891.0,
"step": 955
},
{
"epoch": 0.61184,
"grad_norm": 3.479156255722046,
"learning_rate": 5e-06,
"loss": 1.1499,
"mean_token_accuracy": 0.6750801056623459,
"num_tokens": 12474836.0,
"step": 956
},
{
"epoch": 0.61248,
"grad_norm": 2.7899651527404785,
"learning_rate": 5e-06,
"loss": 1.2879,
"mean_token_accuracy": 0.6665042042732239,
"num_tokens": 12493558.0,
"step": 957
},
{
"epoch": 0.61312,
"grad_norm": 3.6457180976867676,
"learning_rate": 5e-06,
"loss": 1.2007,
"mean_token_accuracy": 0.6936507746577263,
"num_tokens": 12506849.0,
"step": 958
},
{
"epoch": 0.61376,
"grad_norm": 3.0956859588623047,
"learning_rate": 5e-06,
"loss": 1.4628,
"mean_token_accuracy": 0.6274904161691666,
"num_tokens": 12522171.0,
"step": 959
},
{
"epoch": 0.6144,
"grad_norm": 3.615293264389038,
"learning_rate": 5e-06,
"loss": 1.2786,
"mean_token_accuracy": 0.679816409945488,
"num_tokens": 12537702.0,
"step": 960
},
{
"epoch": 0.61504,
"grad_norm": 3.4518120288848877,
"learning_rate": 5e-06,
"loss": 1.3591,
"mean_token_accuracy": 0.6446092203259468,
"num_tokens": 12550526.0,
"step": 961
},
{
"epoch": 0.61568,
"grad_norm": 3.4621338844299316,
"learning_rate": 5e-06,
"loss": 1.3075,
"mean_token_accuracy": 0.665081262588501,
"num_tokens": 12564227.0,
"step": 962
},
{
"epoch": 0.61632,
"grad_norm": 3.3471479415893555,
"learning_rate": 5e-06,
"loss": 1.4756,
"mean_token_accuracy": 0.6200397908687592,
"num_tokens": 12578106.0,
"step": 963
},
{
"epoch": 0.61696,
"grad_norm": 3.874799966812134,
"learning_rate": 5e-06,
"loss": 1.2777,
"mean_token_accuracy": 0.6613158509135246,
"num_tokens": 12589750.0,
"step": 964
},
{
"epoch": 0.6176,
"grad_norm": 4.006873607635498,
"learning_rate": 5e-06,
"loss": 1.408,
"mean_token_accuracy": 0.6419458091259003,
"num_tokens": 12602450.0,
"step": 965
},
{
"epoch": 0.61824,
"grad_norm": 3.674241542816162,
"learning_rate": 5e-06,
"loss": 1.2756,
"mean_token_accuracy": 0.6584514081478119,
"num_tokens": 12613871.0,
"step": 966
},
{
"epoch": 0.61888,
"grad_norm": 3.7405648231506348,
"learning_rate": 5e-06,
"loss": 1.3301,
"mean_token_accuracy": 0.6810361295938492,
"num_tokens": 12626220.0,
"step": 967
},
{
"epoch": 0.61952,
"grad_norm": 3.660600185394287,
"learning_rate": 5e-06,
"loss": 1.2219,
"mean_token_accuracy": 0.6716256737709045,
"num_tokens": 12636440.0,
"step": 968
},
{
"epoch": 0.62016,
"grad_norm": 3.4270999431610107,
"learning_rate": 5e-06,
"loss": 1.0976,
"mean_token_accuracy": 0.7019147500395775,
"num_tokens": 12649545.0,
"step": 969
},
{
"epoch": 0.6208,
"grad_norm": 3.562014102935791,
"learning_rate": 5e-06,
"loss": 1.2243,
"mean_token_accuracy": 0.6942142397165298,
"num_tokens": 12660841.0,
"step": 970
},
{
"epoch": 0.62144,
"grad_norm": 4.004054069519043,
"learning_rate": 5e-06,
"loss": 1.0961,
"mean_token_accuracy": 0.6962654888629913,
"num_tokens": 12675967.0,
"step": 971
},
{
"epoch": 0.62208,
"grad_norm": 3.749152898788452,
"learning_rate": 5e-06,
"loss": 1.489,
"mean_token_accuracy": 0.6146213822066784,
"num_tokens": 12687203.0,
"step": 972
},
{
"epoch": 0.62272,
"grad_norm": 3.2638871669769287,
"learning_rate": 5e-06,
"loss": 1.1979,
"mean_token_accuracy": 0.687263160943985,
"num_tokens": 12700697.0,
"step": 973
},
{
"epoch": 0.62336,
"grad_norm": 3.310070037841797,
"learning_rate": 5e-06,
"loss": 1.4779,
"mean_token_accuracy": 0.632742814719677,
"num_tokens": 12716668.0,
"step": 974
},
{
"epoch": 0.624,
"grad_norm": 3.3164589405059814,
"learning_rate": 5e-06,
"loss": 1.301,
"mean_token_accuracy": 0.6563373729586601,
"num_tokens": 12729912.0,
"step": 975
},
{
"epoch": 0.62464,
"grad_norm": 3.2415506839752197,
"learning_rate": 5e-06,
"loss": 1.393,
"mean_token_accuracy": 0.644082136452198,
"num_tokens": 12745520.0,
"step": 976
},
{
"epoch": 0.62528,
"grad_norm": 3.333308458328247,
"learning_rate": 5e-06,
"loss": 1.1238,
"mean_token_accuracy": 0.6863394901156425,
"num_tokens": 12759203.0,
"step": 977
},
{
"epoch": 0.62592,
"grad_norm": 4.198854923248291,
"learning_rate": 5e-06,
"loss": 1.3601,
"mean_token_accuracy": 0.6624843999743462,
"num_tokens": 12770322.0,
"step": 978
},
{
"epoch": 0.62656,
"grad_norm": 3.849907636642456,
"learning_rate": 5e-06,
"loss": 1.2947,
"mean_token_accuracy": 0.6675618216395378,
"num_tokens": 12782951.0,
"step": 979
},
{
"epoch": 0.6272,
"grad_norm": 3.4649503231048584,
"learning_rate": 5e-06,
"loss": 1.1915,
"mean_token_accuracy": 0.6806611344218254,
"num_tokens": 12795383.0,
"step": 980
},
{
"epoch": 0.62784,
"grad_norm": 3.63466739654541,
"learning_rate": 5e-06,
"loss": 1.3124,
"mean_token_accuracy": 0.6731822267174721,
"num_tokens": 12808692.0,
"step": 981
},
{
"epoch": 0.62848,
"grad_norm": 4.293845176696777,
"learning_rate": 5e-06,
"loss": 1.1757,
"mean_token_accuracy": 0.6780604794621468,
"num_tokens": 12821099.0,
"step": 982
},
{
"epoch": 0.62912,
"grad_norm": 3.565584897994995,
"learning_rate": 5e-06,
"loss": 1.1746,
"mean_token_accuracy": 0.6787943094968796,
"num_tokens": 12832201.0,
"step": 983
},
{
"epoch": 0.62976,
"grad_norm": 3.517613410949707,
"learning_rate": 5e-06,
"loss": 1.2167,
"mean_token_accuracy": 0.6914558485150337,
"num_tokens": 12845465.0,
"step": 984
},
{
"epoch": 0.6304,
"grad_norm": 3.6170578002929688,
"learning_rate": 5e-06,
"loss": 1.3226,
"mean_token_accuracy": 0.6587705016136169,
"num_tokens": 12857366.0,
"step": 985
},
{
"epoch": 0.63104,
"grad_norm": 3.504154682159424,
"learning_rate": 5e-06,
"loss": 1.4641,
"mean_token_accuracy": 0.6085046976804733,
"num_tokens": 12871695.0,
"step": 986
},
{
"epoch": 0.63168,
"grad_norm": 3.543142557144165,
"learning_rate": 5e-06,
"loss": 1.1252,
"mean_token_accuracy": 0.7007554769515991,
"num_tokens": 12884113.0,
"step": 987
},
{
"epoch": 0.63232,
"grad_norm": 3.9888851642608643,
"learning_rate": 5e-06,
"loss": 1.2741,
"mean_token_accuracy": 0.656329832971096,
"num_tokens": 12898706.0,
"step": 988
},
{
"epoch": 0.63296,
"grad_norm": 3.472778081893921,
"learning_rate": 5e-06,
"loss": 1.2431,
"mean_token_accuracy": 0.6751798540353775,
"num_tokens": 12911380.0,
"step": 989
},
{
"epoch": 0.6336,
"grad_norm": 3.3277764320373535,
"learning_rate": 5e-06,
"loss": 1.475,
"mean_token_accuracy": 0.632897637784481,
"num_tokens": 12925697.0,
"step": 990
},
{
"epoch": 0.63424,
"grad_norm": 3.047473669052124,
"learning_rate": 5e-06,
"loss": 1.2663,
"mean_token_accuracy": 0.6683759167790413,
"num_tokens": 12939473.0,
"step": 991
},
{
"epoch": 0.63488,
"grad_norm": 3.483201503753662,
"learning_rate": 5e-06,
"loss": 1.3407,
"mean_token_accuracy": 0.6652352660894394,
"num_tokens": 12952439.0,
"step": 992
},
{
"epoch": 0.63552,
"grad_norm": 4.43934965133667,
"learning_rate": 5e-06,
"loss": 1.0979,
"mean_token_accuracy": 0.6819510236382484,
"num_tokens": 12963232.0,
"step": 993
},
{
"epoch": 0.63616,
"grad_norm": 3.2107748985290527,
"learning_rate": 5e-06,
"loss": 1.2219,
"mean_token_accuracy": 0.6648012548685074,
"num_tokens": 12976396.0,
"step": 994
},
{
"epoch": 0.6368,
"grad_norm": 3.8679394721984863,
"learning_rate": 5e-06,
"loss": 1.3487,
"mean_token_accuracy": 0.6491437703371048,
"num_tokens": 12989586.0,
"step": 995
},
{
"epoch": 0.63744,
"grad_norm": 3.75811767578125,
"learning_rate": 5e-06,
"loss": 1.2384,
"mean_token_accuracy": 0.6684290617704391,
"num_tokens": 13002145.0,
"step": 996
},
{
"epoch": 0.63808,
"grad_norm": 4.223326206207275,
"learning_rate": 5e-06,
"loss": 1.3218,
"mean_token_accuracy": 0.6605047658085823,
"num_tokens": 13011853.0,
"step": 997
},
{
"epoch": 0.63872,
"grad_norm": 4.10746955871582,
"learning_rate": 5e-06,
"loss": 1.2647,
"mean_token_accuracy": 0.6529572680592537,
"num_tokens": 13022296.0,
"step": 998
},
{
"epoch": 0.63936,
"grad_norm": 3.858157157897949,
"learning_rate": 5e-06,
"loss": 1.3031,
"mean_token_accuracy": 0.6564661860466003,
"num_tokens": 13032768.0,
"step": 999
},
{
"epoch": 0.64,
"grad_norm": 3.4283535480499268,
"learning_rate": 5e-06,
"loss": 1.3122,
"mean_token_accuracy": 0.6764922738075256,
"num_tokens": 13045249.0,
"step": 1000
},
{
"epoch": 0.64064,
"grad_norm": 3.5663790702819824,
"learning_rate": 5e-06,
"loss": 1.3038,
"mean_token_accuracy": 0.6534328386187553,
"num_tokens": 13057087.0,
"step": 1001
},
{
"epoch": 0.64128,
"grad_norm": 4.08723783493042,
"learning_rate": 5e-06,
"loss": 1.311,
"mean_token_accuracy": 0.6497639790177345,
"num_tokens": 13069353.0,
"step": 1002
},
{
"epoch": 0.64192,
"grad_norm": 3.1709539890289307,
"learning_rate": 5e-06,
"loss": 1.4385,
"mean_token_accuracy": 0.6491308063268661,
"num_tokens": 13084511.0,
"step": 1003
},
{
"epoch": 0.64256,
"grad_norm": 3.7724292278289795,
"learning_rate": 5e-06,
"loss": 1.2049,
"mean_token_accuracy": 0.6716037020087242,
"num_tokens": 13095572.0,
"step": 1004
},
{
"epoch": 0.6432,
"grad_norm": 3.4885339736938477,
"learning_rate": 5e-06,
"loss": 1.1454,
"mean_token_accuracy": 0.7053978741168976,
"num_tokens": 13108817.0,
"step": 1005
},
{
"epoch": 0.64384,
"grad_norm": 3.718435287475586,
"learning_rate": 5e-06,
"loss": 1.2653,
"mean_token_accuracy": 0.6529537960886955,
"num_tokens": 13119845.0,
"step": 1006
},
{
"epoch": 0.64448,
"grad_norm": 3.7939629554748535,
"learning_rate": 5e-06,
"loss": 1.2563,
"mean_token_accuracy": 0.6783741563558578,
"num_tokens": 13131590.0,
"step": 1007
},
{
"epoch": 0.64512,
"grad_norm": 3.0090038776397705,
"learning_rate": 5e-06,
"loss": 1.3032,
"mean_token_accuracy": 0.6621121242642403,
"num_tokens": 13146614.0,
"step": 1008
},
{
"epoch": 0.64576,
"grad_norm": 3.3267111778259277,
"learning_rate": 5e-06,
"loss": 1.4166,
"mean_token_accuracy": 0.6354531794786453,
"num_tokens": 13160682.0,
"step": 1009
},
{
"epoch": 0.6464,
"grad_norm": 3.528743267059326,
"learning_rate": 5e-06,
"loss": 1.246,
"mean_token_accuracy": 0.6703604385256767,
"num_tokens": 13172998.0,
"step": 1010
},
{
"epoch": 0.64704,
"grad_norm": 3.2315750122070312,
"learning_rate": 5e-06,
"loss": 1.2159,
"mean_token_accuracy": 0.6840131431818008,
"num_tokens": 13187182.0,
"step": 1011
},
{
"epoch": 0.64768,
"grad_norm": 3.885690689086914,
"learning_rate": 5e-06,
"loss": 1.3193,
"mean_token_accuracy": 0.6602044403553009,
"num_tokens": 13198449.0,
"step": 1012
},
{
"epoch": 0.64832,
"grad_norm": 4.214417934417725,
"learning_rate": 5e-06,
"loss": 1.3289,
"mean_token_accuracy": 0.638420894742012,
"num_tokens": 13208888.0,
"step": 1013
},
{
"epoch": 0.64896,
"grad_norm": 3.303224563598633,
"learning_rate": 5e-06,
"loss": 1.3866,
"mean_token_accuracy": 0.6286184787750244,
"num_tokens": 13222042.0,
"step": 1014
},
{
"epoch": 0.6496,
"grad_norm": 3.879709482192993,
"learning_rate": 5e-06,
"loss": 1.4231,
"mean_token_accuracy": 0.6209117695689201,
"num_tokens": 13234662.0,
"step": 1015
},
{
"epoch": 0.65024,
"grad_norm": 3.770817995071411,
"learning_rate": 5e-06,
"loss": 1.3315,
"mean_token_accuracy": 0.6555488482117653,
"num_tokens": 13245357.0,
"step": 1016
},
{
"epoch": 0.65088,
"grad_norm": 3.627957582473755,
"learning_rate": 5e-06,
"loss": 1.2457,
"mean_token_accuracy": 0.6971615925431252,
"num_tokens": 13258154.0,
"step": 1017
},
{
"epoch": 0.65152,
"grad_norm": 3.818009853363037,
"learning_rate": 5e-06,
"loss": 1.2174,
"mean_token_accuracy": 0.6713634058833122,
"num_tokens": 13270001.0,
"step": 1018
},
{
"epoch": 0.65216,
"grad_norm": 3.7726924419403076,
"learning_rate": 5e-06,
"loss": 1.2554,
"mean_token_accuracy": 0.6766888722777367,
"num_tokens": 13281334.0,
"step": 1019
},
{
"epoch": 0.6528,
"grad_norm": 3.608661413192749,
"learning_rate": 5e-06,
"loss": 1.4083,
"mean_token_accuracy": 0.6521065756678581,
"num_tokens": 13294860.0,
"step": 1020
},
{
"epoch": 0.65344,
"grad_norm": 3.7841391563415527,
"learning_rate": 5e-06,
"loss": 1.4197,
"mean_token_accuracy": 0.6439206749200821,
"num_tokens": 13308758.0,
"step": 1021
},
{
"epoch": 0.65408,
"grad_norm": 3.836831569671631,
"learning_rate": 5e-06,
"loss": 1.3922,
"mean_token_accuracy": 0.6572518870234489,
"num_tokens": 13319883.0,
"step": 1022
},
{
"epoch": 0.65472,
"grad_norm": 3.774944305419922,
"learning_rate": 5e-06,
"loss": 1.1792,
"mean_token_accuracy": 0.6656563580036163,
"num_tokens": 13331846.0,
"step": 1023
},
{
"epoch": 0.65536,
"grad_norm": 4.0701751708984375,
"learning_rate": 5e-06,
"loss": 1.2333,
"mean_token_accuracy": 0.6843068152666092,
"num_tokens": 13343010.0,
"step": 1024
},
{
"epoch": 0.656,
"grad_norm": 3.7170510292053223,
"learning_rate": 5e-06,
"loss": 1.1427,
"mean_token_accuracy": 0.6941706016659737,
"num_tokens": 13356096.0,
"step": 1025
},
{
"epoch": 0.65664,
"grad_norm": 3.4047844409942627,
"learning_rate": 5e-06,
"loss": 1.2994,
"mean_token_accuracy": 0.6502626538276672,
"num_tokens": 13370415.0,
"step": 1026
},
{
"epoch": 0.65728,
"grad_norm": 3.013894557952881,
"learning_rate": 5e-06,
"loss": 1.2778,
"mean_token_accuracy": 0.6713566333055496,
"num_tokens": 13385621.0,
"step": 1027
},
{
"epoch": 0.65792,
"grad_norm": 3.8273723125457764,
"learning_rate": 5e-06,
"loss": 1.4213,
"mean_token_accuracy": 0.6448807269334793,
"num_tokens": 13399913.0,
"step": 1028
},
{
"epoch": 0.65856,
"grad_norm": 4.501821041107178,
"learning_rate": 5e-06,
"loss": 1.3451,
"mean_token_accuracy": 0.6397663801908493,
"num_tokens": 13410323.0,
"step": 1029
},
{
"epoch": 0.6592,
"grad_norm": 3.656630516052246,
"learning_rate": 5e-06,
"loss": 1.3693,
"mean_token_accuracy": 0.6572180986404419,
"num_tokens": 13421431.0,
"step": 1030
},
{
"epoch": 0.65984,
"grad_norm": 3.761538505554199,
"learning_rate": 5e-06,
"loss": 1.3701,
"mean_token_accuracy": 0.6800885275006294,
"num_tokens": 13433250.0,
"step": 1031
},
{
"epoch": 0.66048,
"grad_norm": 3.5799546241760254,
"learning_rate": 5e-06,
"loss": 1.3473,
"mean_token_accuracy": 0.6503010243177414,
"num_tokens": 13446010.0,
"step": 1032
},
{
"epoch": 0.66112,
"grad_norm": 3.578547239303589,
"learning_rate": 5e-06,
"loss": 1.2915,
"mean_token_accuracy": 0.7072044536471367,
"num_tokens": 13458978.0,
"step": 1033
},
{
"epoch": 0.66176,
"grad_norm": 3.554094076156616,
"learning_rate": 5e-06,
"loss": 1.1486,
"mean_token_accuracy": 0.6704866662621498,
"num_tokens": 13471248.0,
"step": 1034
},
{
"epoch": 0.6624,
"grad_norm": 3.5921144485473633,
"learning_rate": 5e-06,
"loss": 1.3039,
"mean_token_accuracy": 0.6752297282218933,
"num_tokens": 13483651.0,
"step": 1035
},
{
"epoch": 0.66304,
"grad_norm": 3.580885648727417,
"learning_rate": 5e-06,
"loss": 1.3638,
"mean_token_accuracy": 0.6531356200575829,
"num_tokens": 13496365.0,
"step": 1036
},
{
"epoch": 0.66368,
"grad_norm": 3.6400530338287354,
"learning_rate": 5e-06,
"loss": 1.2824,
"mean_token_accuracy": 0.6671290174126625,
"num_tokens": 13509196.0,
"step": 1037
},
{
"epoch": 0.66432,
"grad_norm": 3.050649404525757,
"learning_rate": 5e-06,
"loss": 1.2883,
"mean_token_accuracy": 0.6613614112138748,
"num_tokens": 13524592.0,
"step": 1038
},
{
"epoch": 0.66496,
"grad_norm": 3.1810715198516846,
"learning_rate": 5e-06,
"loss": 1.4794,
"mean_token_accuracy": 0.633582279086113,
"num_tokens": 13539709.0,
"step": 1039
},
{
"epoch": 0.6656,
"grad_norm": 3.488229751586914,
"learning_rate": 5e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.6704655885696411,
"num_tokens": 13551656.0,
"step": 1040
},
{
"epoch": 0.66624,
"grad_norm": 3.1657679080963135,
"learning_rate": 5e-06,
"loss": 1.405,
"mean_token_accuracy": 0.6457558646798134,
"num_tokens": 13566681.0,
"step": 1041
},
{
"epoch": 0.66688,
"grad_norm": 3.7111074924468994,
"learning_rate": 5e-06,
"loss": 1.1577,
"mean_token_accuracy": 0.6851859763264656,
"num_tokens": 13578777.0,
"step": 1042
},
{
"epoch": 0.66752,
"grad_norm": 3.803246021270752,
"learning_rate": 5e-06,
"loss": 1.7394,
"mean_token_accuracy": 0.5742352418601513,
"num_tokens": 13591184.0,
"step": 1043
},
{
"epoch": 0.66816,
"grad_norm": 3.44681453704834,
"learning_rate": 5e-06,
"loss": 1.249,
"mean_token_accuracy": 0.6815991401672363,
"num_tokens": 13603011.0,
"step": 1044
},
{
"epoch": 0.6688,
"grad_norm": 3.4363629817962646,
"learning_rate": 5e-06,
"loss": 1.207,
"mean_token_accuracy": 0.6451572626829147,
"num_tokens": 13617577.0,
"step": 1045
},
{
"epoch": 0.66944,
"grad_norm": 3.9714715480804443,
"learning_rate": 5e-06,
"loss": 1.316,
"mean_token_accuracy": 0.6733849868178368,
"num_tokens": 13628510.0,
"step": 1046
},
{
"epoch": 0.67008,
"grad_norm": 3.5095605850219727,
"learning_rate": 5e-06,
"loss": 1.2265,
"mean_token_accuracy": 0.6774598509073257,
"num_tokens": 13639965.0,
"step": 1047
},
{
"epoch": 0.67072,
"grad_norm": 3.4731342792510986,
"learning_rate": 5e-06,
"loss": 1.514,
"mean_token_accuracy": 0.6291368082165718,
"num_tokens": 13652881.0,
"step": 1048
},
{
"epoch": 0.67136,
"grad_norm": 3.4731788635253906,
"learning_rate": 5e-06,
"loss": 1.2134,
"mean_token_accuracy": 0.7064939141273499,
"num_tokens": 13665986.0,
"step": 1049
},
{
"epoch": 0.672,
"grad_norm": 3.885256052017212,
"learning_rate": 5e-06,
"loss": 1.2582,
"mean_token_accuracy": 0.6920784562826157,
"num_tokens": 13677555.0,
"step": 1050
},
{
"epoch": 0.67264,
"grad_norm": 3.5971357822418213,
"learning_rate": 5e-06,
"loss": 1.2803,
"mean_token_accuracy": 0.6860932558774948,
"num_tokens": 13689439.0,
"step": 1051
},
{
"epoch": 0.67328,
"grad_norm": 3.4999284744262695,
"learning_rate": 5e-06,
"loss": 1.2694,
"mean_token_accuracy": 0.6710788905620575,
"num_tokens": 13703306.0,
"step": 1052
},
{
"epoch": 0.67392,
"grad_norm": 3.894716262817383,
"learning_rate": 5e-06,
"loss": 1.1895,
"mean_token_accuracy": 0.694163866341114,
"num_tokens": 13715107.0,
"step": 1053
},
{
"epoch": 0.67456,
"grad_norm": 3.8361921310424805,
"learning_rate": 5e-06,
"loss": 1.3142,
"mean_token_accuracy": 0.6533372104167938,
"num_tokens": 13726324.0,
"step": 1054
},
{
"epoch": 0.6752,
"grad_norm": 3.5220136642456055,
"learning_rate": 5e-06,
"loss": 1.5138,
"mean_token_accuracy": 0.6085778698325157,
"num_tokens": 13739132.0,
"step": 1055
},
{
"epoch": 0.67584,
"grad_norm": 3.4445347785949707,
"learning_rate": 5e-06,
"loss": 1.3869,
"mean_token_accuracy": 0.6502392590045929,
"num_tokens": 13751996.0,
"step": 1056
},
{
"epoch": 0.67648,
"grad_norm": 4.514054298400879,
"learning_rate": 5e-06,
"loss": 1.0082,
"mean_token_accuracy": 0.7111445441842079,
"num_tokens": 13762912.0,
"step": 1057
},
{
"epoch": 0.67712,
"grad_norm": 3.4511091709136963,
"learning_rate": 5e-06,
"loss": 1.2244,
"mean_token_accuracy": 0.6701223999261856,
"num_tokens": 13776094.0,
"step": 1058
},
{
"epoch": 0.67776,
"grad_norm": 3.518554449081421,
"learning_rate": 5e-06,
"loss": 1.4949,
"mean_token_accuracy": 0.6190471276640892,
"num_tokens": 13789681.0,
"step": 1059
},
{
"epoch": 0.6784,
"grad_norm": 3.177955150604248,
"learning_rate": 5e-06,
"loss": 1.4254,
"mean_token_accuracy": 0.6302540749311447,
"num_tokens": 13802777.0,
"step": 1060
},
{
"epoch": 0.67904,
"grad_norm": 3.7214250564575195,
"learning_rate": 5e-06,
"loss": 1.3088,
"mean_token_accuracy": 0.6556456014513969,
"num_tokens": 13815229.0,
"step": 1061
},
{
"epoch": 0.67968,
"grad_norm": 3.726001739501953,
"learning_rate": 5e-06,
"loss": 1.2057,
"mean_token_accuracy": 0.6688976883888245,
"num_tokens": 13826932.0,
"step": 1062
},
{
"epoch": 0.68032,
"grad_norm": 3.1761860847473145,
"learning_rate": 5e-06,
"loss": 1.4054,
"mean_token_accuracy": 0.6317546740174294,
"num_tokens": 13842144.0,
"step": 1063
},
{
"epoch": 0.68096,
"grad_norm": 4.224031448364258,
"learning_rate": 5e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.6879750266671181,
"num_tokens": 13852498.0,
"step": 1064
},
{
"epoch": 0.6816,
"grad_norm": 3.1462998390197754,
"learning_rate": 5e-06,
"loss": 1.3639,
"mean_token_accuracy": 0.6389659121632576,
"num_tokens": 13867842.0,
"step": 1065
},
{
"epoch": 0.68224,
"grad_norm": 3.7994680404663086,
"learning_rate": 5e-06,
"loss": 1.2442,
"mean_token_accuracy": 0.6608950793743134,
"num_tokens": 13878904.0,
"step": 1066
},
{
"epoch": 0.68288,
"grad_norm": 3.3029258251190186,
"learning_rate": 5e-06,
"loss": 1.1501,
"mean_token_accuracy": 0.6978383362293243,
"num_tokens": 13892970.0,
"step": 1067
},
{
"epoch": 0.68352,
"grad_norm": 4.019161224365234,
"learning_rate": 5e-06,
"loss": 1.2156,
"mean_token_accuracy": 0.689938597381115,
"num_tokens": 13904207.0,
"step": 1068
},
{
"epoch": 0.68416,
"grad_norm": 3.9899635314941406,
"learning_rate": 5e-06,
"loss": 1.4744,
"mean_token_accuracy": 0.6247128024697304,
"num_tokens": 13917327.0,
"step": 1069
},
{
"epoch": 0.6848,
"grad_norm": 5.03689432144165,
"learning_rate": 5e-06,
"loss": 1.3551,
"mean_token_accuracy": 0.6756569370627403,
"num_tokens": 13927551.0,
"step": 1070
},
{
"epoch": 0.68544,
"grad_norm": 3.43404221534729,
"learning_rate": 5e-06,
"loss": 1.1618,
"mean_token_accuracy": 0.6679700240492821,
"num_tokens": 13939630.0,
"step": 1071
},
{
"epoch": 0.68608,
"grad_norm": 4.027390956878662,
"learning_rate": 5e-06,
"loss": 1.3842,
"mean_token_accuracy": 0.6630447581410408,
"num_tokens": 13949082.0,
"step": 1072
},
{
"epoch": 0.68672,
"grad_norm": 3.764420986175537,
"learning_rate": 5e-06,
"loss": 1.293,
"mean_token_accuracy": 0.6628079935908318,
"num_tokens": 13962085.0,
"step": 1073
},
{
"epoch": 0.68736,
"grad_norm": 3.617522954940796,
"learning_rate": 5e-06,
"loss": 1.5355,
"mean_token_accuracy": 0.6268560141324997,
"num_tokens": 13973510.0,
"step": 1074
},
{
"epoch": 0.688,
"grad_norm": 3.6434836387634277,
"learning_rate": 5e-06,
"loss": 1.3283,
"mean_token_accuracy": 0.6506749242544174,
"num_tokens": 13986487.0,
"step": 1075
},
{
"epoch": 0.68864,
"grad_norm": 3.4601213932037354,
"learning_rate": 5e-06,
"loss": 1.1938,
"mean_token_accuracy": 0.6761009320616722,
"num_tokens": 13998818.0,
"step": 1076
},
{
"epoch": 0.68928,
"grad_norm": 3.537867307662964,
"learning_rate": 5e-06,
"loss": 1.2904,
"mean_token_accuracy": 0.6788460239768028,
"num_tokens": 14011012.0,
"step": 1077
},
{
"epoch": 0.68992,
"grad_norm": 3.204850435256958,
"learning_rate": 5e-06,
"loss": 1.257,
"mean_token_accuracy": 0.650609090924263,
"num_tokens": 14026133.0,
"step": 1078
},
{
"epoch": 0.69056,
"grad_norm": 3.1684117317199707,
"learning_rate": 5e-06,
"loss": 1.3857,
"mean_token_accuracy": 0.669170081615448,
"num_tokens": 14041463.0,
"step": 1079
},
{
"epoch": 0.6912,
"grad_norm": 2.97310209274292,
"learning_rate": 5e-06,
"loss": 1.1807,
"mean_token_accuracy": 0.678413525223732,
"num_tokens": 14055206.0,
"step": 1080
},
{
"epoch": 0.69184,
"grad_norm": 3.415344476699829,
"learning_rate": 5e-06,
"loss": 1.335,
"mean_token_accuracy": 0.6698315292596817,
"num_tokens": 14067043.0,
"step": 1081
},
{
"epoch": 0.69248,
"grad_norm": 3.2605786323547363,
"learning_rate": 5e-06,
"loss": 1.23,
"mean_token_accuracy": 0.6607236042618752,
"num_tokens": 14081150.0,
"step": 1082
},
{
"epoch": 0.69312,
"grad_norm": 3.5928292274475098,
"learning_rate": 5e-06,
"loss": 1.3275,
"mean_token_accuracy": 0.6353218033909798,
"num_tokens": 14093199.0,
"step": 1083
},
{
"epoch": 0.69376,
"grad_norm": 3.6726202964782715,
"learning_rate": 5e-06,
"loss": 1.2071,
"mean_token_accuracy": 0.6714643463492393,
"num_tokens": 14106118.0,
"step": 1084
},
{
"epoch": 0.6944,
"grad_norm": 3.3273112773895264,
"learning_rate": 5e-06,
"loss": 1.2143,
"mean_token_accuracy": 0.6702926605939865,
"num_tokens": 14119919.0,
"step": 1085
},
{
"epoch": 0.69504,
"grad_norm": 3.3181533813476562,
"learning_rate": 5e-06,
"loss": 1.525,
"mean_token_accuracy": 0.6393495798110962,
"num_tokens": 14134965.0,
"step": 1086
},
{
"epoch": 0.69568,
"grad_norm": 3.290024995803833,
"learning_rate": 5e-06,
"loss": 1.4868,
"mean_token_accuracy": 0.641509011387825,
"num_tokens": 14150873.0,
"step": 1087
},
{
"epoch": 0.69632,
"grad_norm": 3.179009199142456,
"learning_rate": 5e-06,
"loss": 1.3236,
"mean_token_accuracy": 0.6799350008368492,
"num_tokens": 14165423.0,
"step": 1088
},
{
"epoch": 0.69696,
"grad_norm": 4.067260265350342,
"learning_rate": 5e-06,
"loss": 1.3399,
"mean_token_accuracy": 0.6480335146188736,
"num_tokens": 14177467.0,
"step": 1089
},
{
"epoch": 0.6976,
"grad_norm": 3.0903289318084717,
"learning_rate": 5e-06,
"loss": 1.2387,
"mean_token_accuracy": 0.6776180788874626,
"num_tokens": 14192746.0,
"step": 1090
},
{
"epoch": 0.69824,
"grad_norm": 3.60392165184021,
"learning_rate": 5e-06,
"loss": 1.4149,
"mean_token_accuracy": 0.6424715965986252,
"num_tokens": 14205814.0,
"step": 1091
},
{
"epoch": 0.69888,
"grad_norm": 3.857509136199951,
"learning_rate": 5e-06,
"loss": 1.2077,
"mean_token_accuracy": 0.6793344393372536,
"num_tokens": 14216791.0,
"step": 1092
},
{
"epoch": 0.69952,
"grad_norm": 3.376009702682495,
"learning_rate": 5e-06,
"loss": 1.329,
"mean_token_accuracy": 0.6632150262594223,
"num_tokens": 14231190.0,
"step": 1093
},
{
"epoch": 0.70016,
"grad_norm": 3.522667407989502,
"learning_rate": 5e-06,
"loss": 1.3746,
"mean_token_accuracy": 0.6448647379875183,
"num_tokens": 14246266.0,
"step": 1094
},
{
"epoch": 0.7008,
"grad_norm": 3.88810658454895,
"learning_rate": 5e-06,
"loss": 1.4056,
"mean_token_accuracy": 0.6266975551843643,
"num_tokens": 14258900.0,
"step": 1095
},
{
"epoch": 0.70144,
"grad_norm": 4.134660243988037,
"learning_rate": 5e-06,
"loss": 1.4136,
"mean_token_accuracy": 0.6337871551513672,
"num_tokens": 14270135.0,
"step": 1096
},
{
"epoch": 0.70208,
"grad_norm": 2.7987403869628906,
"learning_rate": 5e-06,
"loss": 1.2978,
"mean_token_accuracy": 0.6591611802577972,
"num_tokens": 14287885.0,
"step": 1097
},
{
"epoch": 0.70272,
"grad_norm": 3.6904680728912354,
"learning_rate": 5e-06,
"loss": 1.3421,
"mean_token_accuracy": 0.6668279096484184,
"num_tokens": 14299211.0,
"step": 1098
},
{
"epoch": 0.70336,
"grad_norm": 3.754704475402832,
"learning_rate": 5e-06,
"loss": 1.2728,
"mean_token_accuracy": 0.6614086180925369,
"num_tokens": 14310772.0,
"step": 1099
},
{
"epoch": 0.704,
"grad_norm": 4.1148529052734375,
"learning_rate": 5e-06,
"loss": 1.3038,
"mean_token_accuracy": 0.6619797796010971,
"num_tokens": 14321538.0,
"step": 1100
},
{
"epoch": 0.70464,
"grad_norm": 3.9892449378967285,
"learning_rate": 5e-06,
"loss": 1.4351,
"mean_token_accuracy": 0.6713762879371643,
"num_tokens": 14332186.0,
"step": 1101
},
{
"epoch": 0.70528,
"grad_norm": 2.8868937492370605,
"learning_rate": 5e-06,
"loss": 1.3939,
"mean_token_accuracy": 0.6400659307837486,
"num_tokens": 14349043.0,
"step": 1102
},
{
"epoch": 0.70592,
"grad_norm": 3.4299302101135254,
"learning_rate": 5e-06,
"loss": 1.2674,
"mean_token_accuracy": 0.6643095165491104,
"num_tokens": 14363765.0,
"step": 1103
},
{
"epoch": 0.70656,
"grad_norm": 3.3706107139587402,
"learning_rate": 5e-06,
"loss": 1.3314,
"mean_token_accuracy": 0.6502428278326988,
"num_tokens": 14377935.0,
"step": 1104
},
{
"epoch": 0.7072,
"grad_norm": 3.353766441345215,
"learning_rate": 5e-06,
"loss": 1.4449,
"mean_token_accuracy": 0.6395809948444366,
"num_tokens": 14391497.0,
"step": 1105
},
{
"epoch": 0.70784,
"grad_norm": 3.7346043586730957,
"learning_rate": 5e-06,
"loss": 1.2955,
"mean_token_accuracy": 0.6504843458533287,
"num_tokens": 14403120.0,
"step": 1106
},
{
"epoch": 0.70848,
"grad_norm": 3.9729044437408447,
"learning_rate": 5e-06,
"loss": 1.2647,
"mean_token_accuracy": 0.6647339016199112,
"num_tokens": 14415923.0,
"step": 1107
},
{
"epoch": 0.70912,
"grad_norm": 4.029970169067383,
"learning_rate": 5e-06,
"loss": 1.227,
"mean_token_accuracy": 0.6661604270339012,
"num_tokens": 14427554.0,
"step": 1108
},
{
"epoch": 0.70976,
"grad_norm": 3.4321465492248535,
"learning_rate": 5e-06,
"loss": 1.1746,
"mean_token_accuracy": 0.6878796294331551,
"num_tokens": 14441354.0,
"step": 1109
},
{
"epoch": 0.7104,
"grad_norm": 3.303091287612915,
"learning_rate": 5e-06,
"loss": 1.3507,
"mean_token_accuracy": 0.6451763212680817,
"num_tokens": 14455989.0,
"step": 1110
},
{
"epoch": 0.71104,
"grad_norm": 3.9641027450561523,
"learning_rate": 5e-06,
"loss": 1.287,
"mean_token_accuracy": 0.661915197968483,
"num_tokens": 14466700.0,
"step": 1111
},
{
"epoch": 0.71168,
"grad_norm": 3.4277381896972656,
"learning_rate": 5e-06,
"loss": 1.1418,
"mean_token_accuracy": 0.6885363236069679,
"num_tokens": 14479735.0,
"step": 1112
},
{
"epoch": 0.71232,
"grad_norm": 3.531708240509033,
"learning_rate": 5e-06,
"loss": 1.1786,
"mean_token_accuracy": 0.6798023506999016,
"num_tokens": 14492428.0,
"step": 1113
},
{
"epoch": 0.71296,
"grad_norm": 3.962233304977417,
"learning_rate": 5e-06,
"loss": 1.5488,
"mean_token_accuracy": 0.6183573752641678,
"num_tokens": 14505050.0,
"step": 1114
},
{
"epoch": 0.7136,
"grad_norm": 3.1472697257995605,
"learning_rate": 5e-06,
"loss": 1.426,
"mean_token_accuracy": 0.6550877764821053,
"num_tokens": 14519128.0,
"step": 1115
},
{
"epoch": 0.71424,
"grad_norm": 3.8537216186523438,
"learning_rate": 5e-06,
"loss": 1.1991,
"mean_token_accuracy": 0.682333379983902,
"num_tokens": 14530594.0,
"step": 1116
},
{
"epoch": 0.71488,
"grad_norm": 3.527343511581421,
"learning_rate": 5e-06,
"loss": 1.207,
"mean_token_accuracy": 0.6722685918211937,
"num_tokens": 14541472.0,
"step": 1117
},
{
"epoch": 0.71552,
"grad_norm": 3.790855646133423,
"learning_rate": 5e-06,
"loss": 1.2531,
"mean_token_accuracy": 0.6708436533808708,
"num_tokens": 14552813.0,
"step": 1118
},
{
"epoch": 0.71616,
"grad_norm": 3.553488254547119,
"learning_rate": 5e-06,
"loss": 1.1016,
"mean_token_accuracy": 0.7148231789469719,
"num_tokens": 14565447.0,
"step": 1119
},
{
"epoch": 0.7168,
"grad_norm": 3.887118339538574,
"learning_rate": 5e-06,
"loss": 1.1646,
"mean_token_accuracy": 0.6907912865281105,
"num_tokens": 14577377.0,
"step": 1120
},
{
"epoch": 0.71744,
"grad_norm": 3.0343868732452393,
"learning_rate": 5e-06,
"loss": 1.142,
"mean_token_accuracy": 0.6866175085306168,
"num_tokens": 14591768.0,
"step": 1121
},
{
"epoch": 0.71808,
"grad_norm": 4.561229705810547,
"learning_rate": 5e-06,
"loss": 1.4004,
"mean_token_accuracy": 0.6330222748219967,
"num_tokens": 14603193.0,
"step": 1122
},
{
"epoch": 0.71872,
"grad_norm": 3.5638325214385986,
"learning_rate": 5e-06,
"loss": 1.1526,
"mean_token_accuracy": 0.6880695223808289,
"num_tokens": 14617007.0,
"step": 1123
},
{
"epoch": 0.71936,
"grad_norm": 3.810415267944336,
"learning_rate": 5e-06,
"loss": 1.279,
"mean_token_accuracy": 0.670023150742054,
"num_tokens": 14629048.0,
"step": 1124
},
{
"epoch": 0.72,
"grad_norm": 4.179751396179199,
"learning_rate": 5e-06,
"loss": 1.0399,
"mean_token_accuracy": 0.7168305143713951,
"num_tokens": 14639669.0,
"step": 1125
},
{
"epoch": 0.72064,
"grad_norm": 3.539612054824829,
"learning_rate": 5e-06,
"loss": 1.3306,
"mean_token_accuracy": 0.6881062537431717,
"num_tokens": 14652435.0,
"step": 1126
},
{
"epoch": 0.72128,
"grad_norm": 3.597693681716919,
"learning_rate": 5e-06,
"loss": 1.3782,
"mean_token_accuracy": 0.6435956582427025,
"num_tokens": 14664804.0,
"step": 1127
},
{
"epoch": 0.72192,
"grad_norm": 3.3020715713500977,
"learning_rate": 5e-06,
"loss": 1.3158,
"mean_token_accuracy": 0.6422711089253426,
"num_tokens": 14679481.0,
"step": 1128
},
{
"epoch": 0.72256,
"grad_norm": 3.4006054401397705,
"learning_rate": 5e-06,
"loss": 1.4268,
"mean_token_accuracy": 0.6340702697634697,
"num_tokens": 14693009.0,
"step": 1129
},
{
"epoch": 0.7232,
"grad_norm": 3.6534066200256348,
"learning_rate": 5e-06,
"loss": 1.3942,
"mean_token_accuracy": 0.6490144804120064,
"num_tokens": 14704415.0,
"step": 1130
},
{
"epoch": 0.72384,
"grad_norm": 4.022477149963379,
"learning_rate": 5e-06,
"loss": 1.4425,
"mean_token_accuracy": 0.6391843035817146,
"num_tokens": 14718972.0,
"step": 1131
},
{
"epoch": 0.72448,
"grad_norm": 3.717512369155884,
"learning_rate": 5e-06,
"loss": 1.337,
"mean_token_accuracy": 0.6694408059120178,
"num_tokens": 14731864.0,
"step": 1132
},
{
"epoch": 0.72512,
"grad_norm": 3.640937566757202,
"learning_rate": 5e-06,
"loss": 1.2358,
"mean_token_accuracy": 0.6641776859760284,
"num_tokens": 14743467.0,
"step": 1133
},
{
"epoch": 0.72576,
"grad_norm": 3.5870702266693115,
"learning_rate": 5e-06,
"loss": 1.2264,
"mean_token_accuracy": 0.6706337183713913,
"num_tokens": 14755081.0,
"step": 1134
},
{
"epoch": 0.7264,
"grad_norm": 3.6272132396698,
"learning_rate": 5e-06,
"loss": 1.3878,
"mean_token_accuracy": 0.6861530616879463,
"num_tokens": 14766730.0,
"step": 1135
},
{
"epoch": 0.72704,
"grad_norm": 3.349130392074585,
"learning_rate": 5e-06,
"loss": 1.428,
"mean_token_accuracy": 0.6495333984494209,
"num_tokens": 14780158.0,
"step": 1136
},
{
"epoch": 0.72768,
"grad_norm": 3.8108246326446533,
"learning_rate": 5e-06,
"loss": 1.2034,
"mean_token_accuracy": 0.6748805195093155,
"num_tokens": 14793884.0,
"step": 1137
},
{
"epoch": 0.72832,
"grad_norm": 3.4483556747436523,
"learning_rate": 5e-06,
"loss": 1.3949,
"mean_token_accuracy": 0.6391731649637222,
"num_tokens": 14806967.0,
"step": 1138
},
{
"epoch": 0.72896,
"grad_norm": 3.3666470050811768,
"learning_rate": 5e-06,
"loss": 1.2185,
"mean_token_accuracy": 0.6691764742136002,
"num_tokens": 14820303.0,
"step": 1139
},
{
"epoch": 0.7296,
"grad_norm": 3.32536244392395,
"learning_rate": 5e-06,
"loss": 1.2849,
"mean_token_accuracy": 0.6658271849155426,
"num_tokens": 14834025.0,
"step": 1140
},
{
"epoch": 0.73024,
"grad_norm": 3.825983762741089,
"learning_rate": 5e-06,
"loss": 1.6181,
"mean_token_accuracy": 0.602071076631546,
"num_tokens": 14847478.0,
"step": 1141
},
{
"epoch": 0.73088,
"grad_norm": 4.397375106811523,
"learning_rate": 5e-06,
"loss": 1.3697,
"mean_token_accuracy": 0.6505918800830841,
"num_tokens": 14859179.0,
"step": 1142
},
{
"epoch": 0.73152,
"grad_norm": 4.159323215484619,
"learning_rate": 5e-06,
"loss": 1.1297,
"mean_token_accuracy": 0.7010925114154816,
"num_tokens": 14869522.0,
"step": 1143
},
{
"epoch": 0.73216,
"grad_norm": 3.4876530170440674,
"learning_rate": 5e-06,
"loss": 1.245,
"mean_token_accuracy": 0.652951605618,
"num_tokens": 14883372.0,
"step": 1144
},
{
"epoch": 0.7328,
"grad_norm": 3.0746846199035645,
"learning_rate": 5e-06,
"loss": 1.5279,
"mean_token_accuracy": 0.6031446754932404,
"num_tokens": 14899033.0,
"step": 1145
},
{
"epoch": 0.73344,
"grad_norm": 3.7521297931671143,
"learning_rate": 5e-06,
"loss": 1.186,
"mean_token_accuracy": 0.6688085421919823,
"num_tokens": 14913040.0,
"step": 1146
},
{
"epoch": 0.73408,
"grad_norm": 3.9737706184387207,
"learning_rate": 5e-06,
"loss": 1.314,
"mean_token_accuracy": 0.6808184832334518,
"num_tokens": 14927324.0,
"step": 1147
},
{
"epoch": 0.73472,
"grad_norm": 3.6961631774902344,
"learning_rate": 5e-06,
"loss": 1.264,
"mean_token_accuracy": 0.6656776443123817,
"num_tokens": 14938251.0,
"step": 1148
},
{
"epoch": 0.73536,
"grad_norm": 4.080604553222656,
"learning_rate": 5e-06,
"loss": 1.4443,
"mean_token_accuracy": 0.6355870217084885,
"num_tokens": 14948864.0,
"step": 1149
},
{
"epoch": 0.736,
"grad_norm": 3.284268617630005,
"learning_rate": 5e-06,
"loss": 1.1416,
"mean_token_accuracy": 0.6982963308691978,
"num_tokens": 14962968.0,
"step": 1150
},
{
"epoch": 0.73664,
"grad_norm": 3.623760223388672,
"learning_rate": 5e-06,
"loss": 1.206,
"mean_token_accuracy": 0.6715554222464561,
"num_tokens": 14974492.0,
"step": 1151
},
{
"epoch": 0.73728,
"grad_norm": 3.6222383975982666,
"learning_rate": 5e-06,
"loss": 1.2002,
"mean_token_accuracy": 0.665322557091713,
"num_tokens": 14987080.0,
"step": 1152
},
{
"epoch": 0.73792,
"grad_norm": 4.134393692016602,
"learning_rate": 5e-06,
"loss": 1.3446,
"mean_token_accuracy": 0.6732967086136341,
"num_tokens": 14997376.0,
"step": 1153
},
{
"epoch": 0.73856,
"grad_norm": 3.1004269123077393,
"learning_rate": 5e-06,
"loss": 1.1475,
"mean_token_accuracy": 0.6673456728458405,
"num_tokens": 15011696.0,
"step": 1154
},
{
"epoch": 0.7392,
"grad_norm": 3.437642812728882,
"learning_rate": 5e-06,
"loss": 1.3756,
"mean_token_accuracy": 0.6303885355591774,
"num_tokens": 15026552.0,
"step": 1155
},
{
"epoch": 0.73984,
"grad_norm": 8.039863586425781,
"learning_rate": 5e-06,
"loss": 1.2034,
"mean_token_accuracy": 0.7016506418585777,
"num_tokens": 15038554.0,
"step": 1156
},
{
"epoch": 0.74048,
"grad_norm": 3.248920440673828,
"learning_rate": 5e-06,
"loss": 1.2926,
"mean_token_accuracy": 0.6631775796413422,
"num_tokens": 15054408.0,
"step": 1157
},
{
"epoch": 0.74112,
"grad_norm": 3.959541082382202,
"learning_rate": 5e-06,
"loss": 1.45,
"mean_token_accuracy": 0.6382048651576042,
"num_tokens": 15065684.0,
"step": 1158
},
{
"epoch": 0.74176,
"grad_norm": 4.347902297973633,
"learning_rate": 5e-06,
"loss": 1.0298,
"mean_token_accuracy": 0.7234738394618034,
"num_tokens": 15077041.0,
"step": 1159
},
{
"epoch": 0.7424,
"grad_norm": 4.224346160888672,
"learning_rate": 5e-06,
"loss": 1.272,
"mean_token_accuracy": 0.6782936900854111,
"num_tokens": 15088347.0,
"step": 1160
},
{
"epoch": 0.74304,
"grad_norm": 3.770258903503418,
"learning_rate": 5e-06,
"loss": 1.3465,
"mean_token_accuracy": 0.661977045238018,
"num_tokens": 15101460.0,
"step": 1161
},
{
"epoch": 0.74368,
"grad_norm": 3.7153191566467285,
"learning_rate": 5e-06,
"loss": 1.3216,
"mean_token_accuracy": 0.6487752310931683,
"num_tokens": 15115486.0,
"step": 1162
},
{
"epoch": 0.74432,
"grad_norm": 4.508492469787598,
"learning_rate": 5e-06,
"loss": 1.2035,
"mean_token_accuracy": 0.6841593757271767,
"num_tokens": 15125039.0,
"step": 1163
},
{
"epoch": 0.74496,
"grad_norm": 3.0245108604431152,
"learning_rate": 5e-06,
"loss": 1.3148,
"mean_token_accuracy": 0.6497530564665794,
"num_tokens": 15142007.0,
"step": 1164
},
{
"epoch": 0.7456,
"grad_norm": 3.7130560874938965,
"learning_rate": 5e-06,
"loss": 1.365,
"mean_token_accuracy": 0.6439146772027016,
"num_tokens": 15154070.0,
"step": 1165
},
{
"epoch": 0.74624,
"grad_norm": 4.014090538024902,
"learning_rate": 5e-06,
"loss": 1.5601,
"mean_token_accuracy": 0.6075103767216206,
"num_tokens": 15165816.0,
"step": 1166
},
{
"epoch": 0.74688,
"grad_norm": 3.5442097187042236,
"learning_rate": 5e-06,
"loss": 1.2896,
"mean_token_accuracy": 0.6592826843261719,
"num_tokens": 15180213.0,
"step": 1167
},
{
"epoch": 0.74752,
"grad_norm": 3.3585143089294434,
"learning_rate": 5e-06,
"loss": 1.3492,
"mean_token_accuracy": 0.650349847972393,
"num_tokens": 15195358.0,
"step": 1168
},
{
"epoch": 0.74816,
"grad_norm": 3.3249661922454834,
"learning_rate": 5e-06,
"loss": 1.2987,
"mean_token_accuracy": 0.6541887670755386,
"num_tokens": 15208187.0,
"step": 1169
},
{
"epoch": 0.7488,
"grad_norm": 3.2732949256896973,
"learning_rate": 5e-06,
"loss": 1.4745,
"mean_token_accuracy": 0.6346693634986877,
"num_tokens": 15224302.0,
"step": 1170
},
{
"epoch": 0.74944,
"grad_norm": 3.717664957046509,
"learning_rate": 5e-06,
"loss": 1.2258,
"mean_token_accuracy": 0.6592052280902863,
"num_tokens": 15235541.0,
"step": 1171
},
{
"epoch": 0.75008,
"grad_norm": 3.3119561672210693,
"learning_rate": 5e-06,
"loss": 1.2281,
"mean_token_accuracy": 0.6707274541258812,
"num_tokens": 15249155.0,
"step": 1172
},
{
"epoch": 0.75072,
"grad_norm": 3.4180824756622314,
"learning_rate": 5e-06,
"loss": 1.2323,
"mean_token_accuracy": 0.6759630665183067,
"num_tokens": 15262745.0,
"step": 1173
},
{
"epoch": 0.75136,
"grad_norm": 3.351557970046997,
"learning_rate": 5e-06,
"loss": 1.2131,
"mean_token_accuracy": 0.6812815740704536,
"num_tokens": 15276413.0,
"step": 1174
},
{
"epoch": 0.752,
"grad_norm": 4.228631973266602,
"learning_rate": 5e-06,
"loss": 1.1811,
"mean_token_accuracy": 0.694750115275383,
"num_tokens": 15287419.0,
"step": 1175
},
{
"epoch": 0.75264,
"grad_norm": 3.346228837966919,
"learning_rate": 5e-06,
"loss": 1.4694,
"mean_token_accuracy": 0.6221407428383827,
"num_tokens": 15304245.0,
"step": 1176
},
{
"epoch": 0.75328,
"grad_norm": 3.899305582046509,
"learning_rate": 5e-06,
"loss": 1.3843,
"mean_token_accuracy": 0.6404564082622528,
"num_tokens": 15315843.0,
"step": 1177
},
{
"epoch": 0.75392,
"grad_norm": 3.3452677726745605,
"learning_rate": 5e-06,
"loss": 1.4037,
"mean_token_accuracy": 0.6525731533765793,
"num_tokens": 15330084.0,
"step": 1178
},
{
"epoch": 0.75456,
"grad_norm": 3.4091222286224365,
"learning_rate": 5e-06,
"loss": 1.4005,
"mean_token_accuracy": 0.6623276621103287,
"num_tokens": 15342406.0,
"step": 1179
},
{
"epoch": 0.7552,
"grad_norm": 3.5373282432556152,
"learning_rate": 5e-06,
"loss": 1.1613,
"mean_token_accuracy": 0.6908905506134033,
"num_tokens": 15355306.0,
"step": 1180
},
{
"epoch": 0.75584,
"grad_norm": 3.9077682495117188,
"learning_rate": 5e-06,
"loss": 1.2934,
"mean_token_accuracy": 0.6559719070792198,
"num_tokens": 15365562.0,
"step": 1181
},
{
"epoch": 0.75648,
"grad_norm": 4.251070022583008,
"learning_rate": 5e-06,
"loss": 1.1809,
"mean_token_accuracy": 0.7090832963585854,
"num_tokens": 15377910.0,
"step": 1182
},
{
"epoch": 0.75712,
"grad_norm": 3.6916239261627197,
"learning_rate": 5e-06,
"loss": 1.1891,
"mean_token_accuracy": 0.6642716750502586,
"num_tokens": 15390158.0,
"step": 1183
},
{
"epoch": 0.75776,
"grad_norm": 3.235966682434082,
"learning_rate": 5e-06,
"loss": 1.3514,
"mean_token_accuracy": 0.654154047369957,
"num_tokens": 15405763.0,
"step": 1184
},
{
"epoch": 0.7584,
"grad_norm": 3.0988378524780273,
"learning_rate": 5e-06,
"loss": 1.3606,
"mean_token_accuracy": 0.6405491232872009,
"num_tokens": 15421326.0,
"step": 1185
},
{
"epoch": 0.75904,
"grad_norm": 3.5612781047821045,
"learning_rate": 5e-06,
"loss": 1.3463,
"mean_token_accuracy": 0.6501871645450592,
"num_tokens": 15434530.0,
"step": 1186
},
{
"epoch": 0.75968,
"grad_norm": 3.6004257202148438,
"learning_rate": 5e-06,
"loss": 1.346,
"mean_token_accuracy": 0.6661404147744179,
"num_tokens": 15448462.0,
"step": 1187
},
{
"epoch": 0.76032,
"grad_norm": 4.093327045440674,
"learning_rate": 5e-06,
"loss": 1.257,
"mean_token_accuracy": 0.6833517551422119,
"num_tokens": 15460521.0,
"step": 1188
},
{
"epoch": 0.76096,
"grad_norm": 3.7774133682250977,
"learning_rate": 5e-06,
"loss": 1.4023,
"mean_token_accuracy": 0.6524300873279572,
"num_tokens": 15472815.0,
"step": 1189
},
{
"epoch": 0.7616,
"grad_norm": 3.2685515880584717,
"learning_rate": 5e-06,
"loss": 1.1868,
"mean_token_accuracy": 0.6791387870907784,
"num_tokens": 15488314.0,
"step": 1190
},
{
"epoch": 0.76224,
"grad_norm": 3.4335551261901855,
"learning_rate": 5e-06,
"loss": 1.4461,
"mean_token_accuracy": 0.6327428966760635,
"num_tokens": 15502345.0,
"step": 1191
},
{
"epoch": 0.76288,
"grad_norm": 3.3318638801574707,
"learning_rate": 5e-06,
"loss": 1.4262,
"mean_token_accuracy": 0.632860004901886,
"num_tokens": 15518148.0,
"step": 1192
},
{
"epoch": 0.76352,
"grad_norm": 3.1482911109924316,
"learning_rate": 5e-06,
"loss": 1.2723,
"mean_token_accuracy": 0.696734681725502,
"num_tokens": 15532567.0,
"step": 1193
},
{
"epoch": 0.76416,
"grad_norm": 4.470282554626465,
"learning_rate": 5e-06,
"loss": 1.4906,
"mean_token_accuracy": 0.6341715455055237,
"num_tokens": 15544296.0,
"step": 1194
},
{
"epoch": 0.7648,
"grad_norm": 3.548245429992676,
"learning_rate": 5e-06,
"loss": 1.4285,
"mean_token_accuracy": 0.628907784819603,
"num_tokens": 15556178.0,
"step": 1195
},
{
"epoch": 0.76544,
"grad_norm": 3.0455758571624756,
"learning_rate": 5e-06,
"loss": 1.2669,
"mean_token_accuracy": 0.6598797962069511,
"num_tokens": 15570402.0,
"step": 1196
},
{
"epoch": 0.76608,
"grad_norm": 3.394630193710327,
"learning_rate": 5e-06,
"loss": 1.3457,
"mean_token_accuracy": 0.6525625661015511,
"num_tokens": 15583933.0,
"step": 1197
},
{
"epoch": 0.76672,
"grad_norm": 3.572402238845825,
"learning_rate": 5e-06,
"loss": 1.1829,
"mean_token_accuracy": 0.6823357492685318,
"num_tokens": 15596838.0,
"step": 1198
},
{
"epoch": 0.76736,
"grad_norm": 4.091769695281982,
"learning_rate": 5e-06,
"loss": 1.3601,
"mean_token_accuracy": 0.6605678722262383,
"num_tokens": 15609200.0,
"step": 1199
},
{
"epoch": 0.768,
"grad_norm": 3.402550220489502,
"learning_rate": 5e-06,
"loss": 1.2345,
"mean_token_accuracy": 0.6593906283378601,
"num_tokens": 15623053.0,
"step": 1200
},
{
"epoch": 0.76864,
"grad_norm": 3.7215263843536377,
"learning_rate": 5e-06,
"loss": 1.3201,
"mean_token_accuracy": 0.6674540042877197,
"num_tokens": 15635540.0,
"step": 1201
},
{
"epoch": 0.76928,
"grad_norm": 3.5162336826324463,
"learning_rate": 5e-06,
"loss": 1.3041,
"mean_token_accuracy": 0.662396639585495,
"num_tokens": 15648152.0,
"step": 1202
},
{
"epoch": 0.76992,
"grad_norm": 3.8758740425109863,
"learning_rate": 5e-06,
"loss": 1.2048,
"mean_token_accuracy": 0.6618586331605911,
"num_tokens": 15659825.0,
"step": 1203
},
{
"epoch": 0.77056,
"grad_norm": 3.6302740573883057,
"learning_rate": 5e-06,
"loss": 1.463,
"mean_token_accuracy": 0.6243670582771301,
"num_tokens": 15675357.0,
"step": 1204
},
{
"epoch": 0.7712,
"grad_norm": 3.250278949737549,
"learning_rate": 5e-06,
"loss": 1.3677,
"mean_token_accuracy": 0.6358913704752922,
"num_tokens": 15690114.0,
"step": 1205
},
{
"epoch": 0.77184,
"grad_norm": 3.5102968215942383,
"learning_rate": 5e-06,
"loss": 1.1182,
"mean_token_accuracy": 0.6968755125999451,
"num_tokens": 15704497.0,
"step": 1206
},
{
"epoch": 0.77248,
"grad_norm": 3.386099100112915,
"learning_rate": 5e-06,
"loss": 1.1454,
"mean_token_accuracy": 0.6858478710055351,
"num_tokens": 15718228.0,
"step": 1207
},
{
"epoch": 0.77312,
"grad_norm": 3.6120481491088867,
"learning_rate": 5e-06,
"loss": 1.3312,
"mean_token_accuracy": 0.669069878757,
"num_tokens": 15731116.0,
"step": 1208
},
{
"epoch": 0.77376,
"grad_norm": 3.7133243083953857,
"learning_rate": 5e-06,
"loss": 1.5032,
"mean_token_accuracy": 0.6273391470313072,
"num_tokens": 15743678.0,
"step": 1209
},
{
"epoch": 0.7744,
"grad_norm": 3.4095213413238525,
"learning_rate": 5e-06,
"loss": 1.3583,
"mean_token_accuracy": 0.6577341482043266,
"num_tokens": 15757250.0,
"step": 1210
},
{
"epoch": 0.77504,
"grad_norm": 4.357828140258789,
"learning_rate": 5e-06,
"loss": 1.4194,
"mean_token_accuracy": 0.6646361202001572,
"num_tokens": 15767568.0,
"step": 1211
},
{
"epoch": 0.77568,
"grad_norm": 3.3669044971466064,
"learning_rate": 5e-06,
"loss": 1.3806,
"mean_token_accuracy": 0.6653162240982056,
"num_tokens": 15781457.0,
"step": 1212
},
{
"epoch": 0.77632,
"grad_norm": 3.057096004486084,
"learning_rate": 5e-06,
"loss": 1.2735,
"mean_token_accuracy": 0.6667918264865875,
"num_tokens": 15796444.0,
"step": 1213
},
{
"epoch": 0.77696,
"grad_norm": 3.549315929412842,
"learning_rate": 5e-06,
"loss": 1.255,
"mean_token_accuracy": 0.668325200676918,
"num_tokens": 15807909.0,
"step": 1214
},
{
"epoch": 0.7776,
"grad_norm": 4.293363571166992,
"learning_rate": 5e-06,
"loss": 1.2011,
"mean_token_accuracy": 0.7006052732467651,
"num_tokens": 15818410.0,
"step": 1215
},
{
"epoch": 0.77824,
"grad_norm": 3.4453113079071045,
"learning_rate": 5e-06,
"loss": 1.4502,
"mean_token_accuracy": 0.6456183791160583,
"num_tokens": 15830410.0,
"step": 1216
},
{
"epoch": 0.77888,
"grad_norm": 3.340660572052002,
"learning_rate": 5e-06,
"loss": 1.3797,
"mean_token_accuracy": 0.6471363380551338,
"num_tokens": 15843270.0,
"step": 1217
},
{
"epoch": 0.77952,
"grad_norm": 3.578989267349243,
"learning_rate": 5e-06,
"loss": 1.3165,
"mean_token_accuracy": 0.6591121554374695,
"num_tokens": 15856513.0,
"step": 1218
},
{
"epoch": 0.78016,
"grad_norm": 3.311697483062744,
"learning_rate": 5e-06,
"loss": 1.3268,
"mean_token_accuracy": 0.6524678990244865,
"num_tokens": 15869453.0,
"step": 1219
},
{
"epoch": 0.7808,
"grad_norm": 3.2292022705078125,
"learning_rate": 5e-06,
"loss": 1.2939,
"mean_token_accuracy": 0.6603180393576622,
"num_tokens": 15884284.0,
"step": 1220
},
{
"epoch": 0.78144,
"grad_norm": 3.189804792404175,
"learning_rate": 5e-06,
"loss": 1.3883,
"mean_token_accuracy": 0.6416840329766273,
"num_tokens": 15898664.0,
"step": 1221
},
{
"epoch": 0.78208,
"grad_norm": 3.1236817836761475,
"learning_rate": 5e-06,
"loss": 1.3237,
"mean_token_accuracy": 0.6408574059605598,
"num_tokens": 15913125.0,
"step": 1222
},
{
"epoch": 0.78272,
"grad_norm": 4.161830902099609,
"learning_rate": 5e-06,
"loss": 1.4025,
"mean_token_accuracy": 0.630896121263504,
"num_tokens": 15925782.0,
"step": 1223
},
{
"epoch": 0.78336,
"grad_norm": 3.626995086669922,
"learning_rate": 5e-06,
"loss": 1.087,
"mean_token_accuracy": 0.6901156529784203,
"num_tokens": 15936307.0,
"step": 1224
},
{
"epoch": 0.784,
"grad_norm": 3.5811476707458496,
"learning_rate": 5e-06,
"loss": 1.1955,
"mean_token_accuracy": 0.6784915700554848,
"num_tokens": 15947469.0,
"step": 1225
},
{
"epoch": 0.78464,
"grad_norm": 3.4900920391082764,
"learning_rate": 5e-06,
"loss": 1.3206,
"mean_token_accuracy": 0.6642494723200798,
"num_tokens": 15961802.0,
"step": 1226
},
{
"epoch": 0.78528,
"grad_norm": 3.295171022415161,
"learning_rate": 5e-06,
"loss": 1.366,
"mean_token_accuracy": 0.6388103812932968,
"num_tokens": 15973542.0,
"step": 1227
},
{
"epoch": 0.78592,
"grad_norm": 3.179863214492798,
"learning_rate": 5e-06,
"loss": 1.4789,
"mean_token_accuracy": 0.6259790062904358,
"num_tokens": 15987918.0,
"step": 1228
},
{
"epoch": 0.78656,
"grad_norm": 3.5669660568237305,
"learning_rate": 5e-06,
"loss": 1.3716,
"mean_token_accuracy": 0.6410401687026024,
"num_tokens": 16000309.0,
"step": 1229
},
{
"epoch": 0.7872,
"grad_norm": 3.2992517948150635,
"learning_rate": 5e-06,
"loss": 1.3798,
"mean_token_accuracy": 0.6566397473216057,
"num_tokens": 16014417.0,
"step": 1230
},
{
"epoch": 0.78784,
"grad_norm": 3.6735100746154785,
"learning_rate": 5e-06,
"loss": 1.2311,
"mean_token_accuracy": 0.6738264411687851,
"num_tokens": 16026169.0,
"step": 1231
},
{
"epoch": 0.78848,
"grad_norm": 4.013977527618408,
"learning_rate": 5e-06,
"loss": 1.3946,
"mean_token_accuracy": 0.6297749131917953,
"num_tokens": 16036396.0,
"step": 1232
},
{
"epoch": 0.78912,
"grad_norm": 3.506371259689331,
"learning_rate": 5e-06,
"loss": 1.3798,
"mean_token_accuracy": 0.6475069150328636,
"num_tokens": 16049573.0,
"step": 1233
},
{
"epoch": 0.78976,
"grad_norm": 3.0766477584838867,
"learning_rate": 5e-06,
"loss": 1.4281,
"mean_token_accuracy": 0.6643402278423309,
"num_tokens": 16064639.0,
"step": 1234
},
{
"epoch": 0.7904,
"grad_norm": 3.5113558769226074,
"learning_rate": 5e-06,
"loss": 1.1854,
"mean_token_accuracy": 0.6849528402090073,
"num_tokens": 16078167.0,
"step": 1235
},
{
"epoch": 0.79104,
"grad_norm": 3.223271369934082,
"learning_rate": 5e-06,
"loss": 1.3133,
"mean_token_accuracy": 0.6554304733872414,
"num_tokens": 16093693.0,
"step": 1236
},
{
"epoch": 0.79168,
"grad_norm": 3.661078691482544,
"learning_rate": 5e-06,
"loss": 1.2121,
"mean_token_accuracy": 0.7052098885178566,
"num_tokens": 16105008.0,
"step": 1237
},
{
"epoch": 0.79232,
"grad_norm": 3.4575560092926025,
"learning_rate": 5e-06,
"loss": 1.3498,
"mean_token_accuracy": 0.6544990688562393,
"num_tokens": 16117846.0,
"step": 1238
},
{
"epoch": 0.79296,
"grad_norm": 3.559100866317749,
"learning_rate": 5e-06,
"loss": 1.4116,
"mean_token_accuracy": 0.6313577368855476,
"num_tokens": 16130981.0,
"step": 1239
},
{
"epoch": 0.7936,
"grad_norm": 3.2983896732330322,
"learning_rate": 5e-06,
"loss": 1.3647,
"mean_token_accuracy": 0.6640519946813583,
"num_tokens": 16144847.0,
"step": 1240
},
{
"epoch": 0.79424,
"grad_norm": 3.622084856033325,
"learning_rate": 5e-06,
"loss": 1.292,
"mean_token_accuracy": 0.6699836328625679,
"num_tokens": 16156779.0,
"step": 1241
},
{
"epoch": 0.79488,
"grad_norm": 4.421840190887451,
"learning_rate": 5e-06,
"loss": 1.2792,
"mean_token_accuracy": 0.673715990036726,
"num_tokens": 16166792.0,
"step": 1242
},
{
"epoch": 0.79552,
"grad_norm": 3.312913656234741,
"learning_rate": 5e-06,
"loss": 1.4633,
"mean_token_accuracy": 0.6244674026966095,
"num_tokens": 16181336.0,
"step": 1243
},
{
"epoch": 0.79616,
"grad_norm": 3.5397815704345703,
"learning_rate": 5e-06,
"loss": 1.4338,
"mean_token_accuracy": 0.6298167407512665,
"num_tokens": 16194684.0,
"step": 1244
},
{
"epoch": 0.7968,
"grad_norm": 3.798386335372925,
"learning_rate": 5e-06,
"loss": 1.3008,
"mean_token_accuracy": 0.6577273011207581,
"num_tokens": 16206010.0,
"step": 1245
},
{
"epoch": 0.79744,
"grad_norm": 3.379908561706543,
"learning_rate": 5e-06,
"loss": 1.0665,
"mean_token_accuracy": 0.6839143261313438,
"num_tokens": 16220119.0,
"step": 1246
},
{
"epoch": 0.79808,
"grad_norm": 3.7385215759277344,
"learning_rate": 5e-06,
"loss": 1.2292,
"mean_token_accuracy": 0.6770885214209557,
"num_tokens": 16230676.0,
"step": 1247
},
{
"epoch": 0.79872,
"grad_norm": 3.6756489276885986,
"learning_rate": 5e-06,
"loss": 1.2106,
"mean_token_accuracy": 0.6802457198500633,
"num_tokens": 16243393.0,
"step": 1248
},
{
"epoch": 0.79936,
"grad_norm": 3.861645221710205,
"learning_rate": 5e-06,
"loss": 1.2558,
"mean_token_accuracy": 0.6762053146958351,
"num_tokens": 16254097.0,
"step": 1249
},
{
"epoch": 0.8,
"grad_norm": 3.3169620037078857,
"learning_rate": 5e-06,
"loss": 1.2539,
"mean_token_accuracy": 0.6607455164194107,
"num_tokens": 16268379.0,
"step": 1250
},
{
"epoch": 0.80064,
"grad_norm": 3.2894480228424072,
"learning_rate": 5e-06,
"loss": 1.3166,
"mean_token_accuracy": 0.6549070253968239,
"num_tokens": 16283564.0,
"step": 1251
},
{
"epoch": 0.80128,
"grad_norm": 3.8048436641693115,
"learning_rate": 5e-06,
"loss": 1.424,
"mean_token_accuracy": 0.6509182900190353,
"num_tokens": 16295386.0,
"step": 1252
},
{
"epoch": 0.80192,
"grad_norm": 3.7577552795410156,
"learning_rate": 5e-06,
"loss": 1.1979,
"mean_token_accuracy": 0.6759278625249863,
"num_tokens": 16309119.0,
"step": 1253
},
{
"epoch": 0.80256,
"grad_norm": 3.8013439178466797,
"learning_rate": 5e-06,
"loss": 1.3405,
"mean_token_accuracy": 0.668271005153656,
"num_tokens": 16320088.0,
"step": 1254
},
{
"epoch": 0.8032,
"grad_norm": 3.75661039352417,
"learning_rate": 5e-06,
"loss": 1.2675,
"mean_token_accuracy": 0.6828467771410942,
"num_tokens": 16332206.0,
"step": 1255
},
{
"epoch": 0.80384,
"grad_norm": 4.377762794494629,
"learning_rate": 5e-06,
"loss": 1.49,
"mean_token_accuracy": 0.6417308822274208,
"num_tokens": 16344154.0,
"step": 1256
},
{
"epoch": 0.80448,
"grad_norm": 3.524298906326294,
"learning_rate": 5e-06,
"loss": 1.2527,
"mean_token_accuracy": 0.6651709750294685,
"num_tokens": 16357771.0,
"step": 1257
},
{
"epoch": 0.80512,
"grad_norm": 3.6572201251983643,
"learning_rate": 5e-06,
"loss": 1.3117,
"mean_token_accuracy": 0.6568779051303864,
"num_tokens": 16369715.0,
"step": 1258
},
{
"epoch": 0.80576,
"grad_norm": 3.557985305786133,
"learning_rate": 5e-06,
"loss": 1.178,
"mean_token_accuracy": 0.688338540494442,
"num_tokens": 16381110.0,
"step": 1259
},
{
"epoch": 0.8064,
"grad_norm": 3.9126033782958984,
"learning_rate": 5e-06,
"loss": 1.5385,
"mean_token_accuracy": 0.6207233294844627,
"num_tokens": 16393612.0,
"step": 1260
},
{
"epoch": 0.80704,
"grad_norm": 3.5483007431030273,
"learning_rate": 5e-06,
"loss": 1.1773,
"mean_token_accuracy": 0.6948479861021042,
"num_tokens": 16406331.0,
"step": 1261
},
{
"epoch": 0.80768,
"grad_norm": 3.6159143447875977,
"learning_rate": 5e-06,
"loss": 1.3108,
"mean_token_accuracy": 0.6626102104783058,
"num_tokens": 16418794.0,
"step": 1262
},
{
"epoch": 0.80832,
"grad_norm": 3.201352834701538,
"learning_rate": 5e-06,
"loss": 1.346,
"mean_token_accuracy": 0.6304316557943821,
"num_tokens": 16429613.0,
"step": 1263
},
{
"epoch": 0.80896,
"grad_norm": 3.9572861194610596,
"learning_rate": 5e-06,
"loss": 1.0437,
"mean_token_accuracy": 0.7080657631158829,
"num_tokens": 16440987.0,
"step": 1264
},
{
"epoch": 0.8096,
"grad_norm": 3.182184934616089,
"learning_rate": 5e-06,
"loss": 1.356,
"mean_token_accuracy": 0.6579003632068634,
"num_tokens": 16455570.0,
"step": 1265
},
{
"epoch": 0.81024,
"grad_norm": 3.835308313369751,
"learning_rate": 5e-06,
"loss": 1.4172,
"mean_token_accuracy": 0.6239579617977142,
"num_tokens": 16468080.0,
"step": 1266
},
{
"epoch": 0.81088,
"grad_norm": 3.3559696674346924,
"learning_rate": 5e-06,
"loss": 1.1735,
"mean_token_accuracy": 0.6860703229904175,
"num_tokens": 16481207.0,
"step": 1267
},
{
"epoch": 0.81152,
"grad_norm": 3.19657039642334,
"learning_rate": 5e-06,
"loss": 1.3224,
"mean_token_accuracy": 0.6673007681965828,
"num_tokens": 16495219.0,
"step": 1268
},
{
"epoch": 0.81216,
"grad_norm": 3.2514398097991943,
"learning_rate": 5e-06,
"loss": 1.4182,
"mean_token_accuracy": 0.6229546442627907,
"num_tokens": 16511488.0,
"step": 1269
},
{
"epoch": 0.8128,
"grad_norm": 2.9578235149383545,
"learning_rate": 5e-06,
"loss": 1.0946,
"mean_token_accuracy": 0.6994348987936974,
"num_tokens": 16527874.0,
"step": 1270
},
{
"epoch": 0.81344,
"grad_norm": 3.202214479446411,
"learning_rate": 5e-06,
"loss": 1.4316,
"mean_token_accuracy": 0.625508576631546,
"num_tokens": 16542263.0,
"step": 1271
},
{
"epoch": 0.81408,
"grad_norm": 3.9414408206939697,
"learning_rate": 5e-06,
"loss": 1.2243,
"mean_token_accuracy": 0.6666690483689308,
"num_tokens": 16554216.0,
"step": 1272
},
{
"epoch": 0.81472,
"grad_norm": 3.792768955230713,
"learning_rate": 5e-06,
"loss": 1.2563,
"mean_token_accuracy": 0.6469622924923897,
"num_tokens": 16566251.0,
"step": 1273
},
{
"epoch": 0.81536,
"grad_norm": 3.4059951305389404,
"learning_rate": 5e-06,
"loss": 1.3999,
"mean_token_accuracy": 0.6422073394060135,
"num_tokens": 16579765.0,
"step": 1274
},
{
"epoch": 0.816,
"grad_norm": 4.562513828277588,
"learning_rate": 5e-06,
"loss": 1.2946,
"mean_token_accuracy": 0.6693150997161865,
"num_tokens": 16589352.0,
"step": 1275
},
{
"epoch": 0.81664,
"grad_norm": 4.269272327423096,
"learning_rate": 5e-06,
"loss": 1.2899,
"mean_token_accuracy": 0.6878708451986313,
"num_tokens": 16598911.0,
"step": 1276
},
{
"epoch": 0.81728,
"grad_norm": 3.5766615867614746,
"learning_rate": 5e-06,
"loss": 1.3643,
"mean_token_accuracy": 0.6596589758992195,
"num_tokens": 16612073.0,
"step": 1277
},
{
"epoch": 0.81792,
"grad_norm": 3.2693169116973877,
"learning_rate": 5e-06,
"loss": 1.2538,
"mean_token_accuracy": 0.6582971885800362,
"num_tokens": 16626673.0,
"step": 1278
},
{
"epoch": 0.81856,
"grad_norm": 3.7346718311309814,
"learning_rate": 5e-06,
"loss": 1.4059,
"mean_token_accuracy": 0.6605831310153008,
"num_tokens": 16640222.0,
"step": 1279
},
{
"epoch": 0.8192,
"grad_norm": 3.571347951889038,
"learning_rate": 5e-06,
"loss": 1.378,
"mean_token_accuracy": 0.6396291702985764,
"num_tokens": 16652331.0,
"step": 1280
},
{
"epoch": 0.81984,
"grad_norm": 3.3202948570251465,
"learning_rate": 5e-06,
"loss": 1.3002,
"mean_token_accuracy": 0.656744010746479,
"num_tokens": 16664098.0,
"step": 1281
},
{
"epoch": 0.82048,
"grad_norm": 3.2276108264923096,
"learning_rate": 5e-06,
"loss": 1.1796,
"mean_token_accuracy": 0.6782404407858849,
"num_tokens": 16678834.0,
"step": 1282
},
{
"epoch": 0.82112,
"grad_norm": 3.5021538734436035,
"learning_rate": 5e-06,
"loss": 1.1757,
"mean_token_accuracy": 0.6985038220882416,
"num_tokens": 16692514.0,
"step": 1283
},
{
"epoch": 0.82176,
"grad_norm": 3.8361024856567383,
"learning_rate": 5e-06,
"loss": 1.3879,
"mean_token_accuracy": 0.6511978656053543,
"num_tokens": 16705296.0,
"step": 1284
},
{
"epoch": 0.8224,
"grad_norm": 3.3450541496276855,
"learning_rate": 5e-06,
"loss": 1.4618,
"mean_token_accuracy": 0.6236701160669327,
"num_tokens": 16719506.0,
"step": 1285
},
{
"epoch": 0.82304,
"grad_norm": 3.344872236251831,
"learning_rate": 5e-06,
"loss": 1.4341,
"mean_token_accuracy": 0.6452220380306244,
"num_tokens": 16733788.0,
"step": 1286
},
{
"epoch": 0.82368,
"grad_norm": 3.2765679359436035,
"learning_rate": 5e-06,
"loss": 1.3945,
"mean_token_accuracy": 0.6469878405332565,
"num_tokens": 16746762.0,
"step": 1287
},
{
"epoch": 0.82432,
"grad_norm": 3.3606464862823486,
"learning_rate": 5e-06,
"loss": 1.442,
"mean_token_accuracy": 0.6300052553415298,
"num_tokens": 16762035.0,
"step": 1288
},
{
"epoch": 0.82496,
"grad_norm": 3.9703168869018555,
"learning_rate": 5e-06,
"loss": 1.4146,
"mean_token_accuracy": 0.6354392319917679,
"num_tokens": 16772696.0,
"step": 1289
},
{
"epoch": 0.8256,
"grad_norm": 3.2966363430023193,
"learning_rate": 5e-06,
"loss": 1.2722,
"mean_token_accuracy": 0.665034607052803,
"num_tokens": 16787285.0,
"step": 1290
},
{
"epoch": 0.82624,
"grad_norm": 3.6354568004608154,
"learning_rate": 5e-06,
"loss": 1.2903,
"mean_token_accuracy": 0.6690637767314911,
"num_tokens": 16799868.0,
"step": 1291
},
{
"epoch": 0.82688,
"grad_norm": 3.9511008262634277,
"learning_rate": 5e-06,
"loss": 1.3668,
"mean_token_accuracy": 0.6623431816697121,
"num_tokens": 16811306.0,
"step": 1292
},
{
"epoch": 0.82752,
"grad_norm": 3.4990999698638916,
"learning_rate": 5e-06,
"loss": 1.2118,
"mean_token_accuracy": 0.6844175234436989,
"num_tokens": 16824295.0,
"step": 1293
},
{
"epoch": 0.82816,
"grad_norm": 3.638296604156494,
"learning_rate": 5e-06,
"loss": 1.1873,
"mean_token_accuracy": 0.6708608791232109,
"num_tokens": 16836129.0,
"step": 1294
},
{
"epoch": 0.8288,
"grad_norm": 3.5374062061309814,
"learning_rate": 5e-06,
"loss": 1.3716,
"mean_token_accuracy": 0.662922739982605,
"num_tokens": 16849257.0,
"step": 1295
},
{
"epoch": 0.82944,
"grad_norm": 4.183645725250244,
"learning_rate": 5e-06,
"loss": 1.2535,
"mean_token_accuracy": 0.6610106378793716,
"num_tokens": 16860223.0,
"step": 1296
},
{
"epoch": 0.83008,
"grad_norm": 3.551673412322998,
"learning_rate": 5e-06,
"loss": 1.2743,
"mean_token_accuracy": 0.6478192396461964,
"num_tokens": 16871987.0,
"step": 1297
},
{
"epoch": 0.83072,
"grad_norm": 3.2299296855926514,
"learning_rate": 5e-06,
"loss": 1.3783,
"mean_token_accuracy": 0.6627595871686935,
"num_tokens": 16886179.0,
"step": 1298
},
{
"epoch": 0.83136,
"grad_norm": 3.688389301300049,
"learning_rate": 5e-06,
"loss": 1.0686,
"mean_token_accuracy": 0.7124327570199966,
"num_tokens": 16898088.0,
"step": 1299
},
{
"epoch": 0.832,
"grad_norm": 3.371751070022583,
"learning_rate": 5e-06,
"loss": 1.4761,
"mean_token_accuracy": 0.624034658074379,
"num_tokens": 16912488.0,
"step": 1300
},
{
"epoch": 0.83264,
"grad_norm": 3.6259591579437256,
"learning_rate": 5e-06,
"loss": 1.1865,
"mean_token_accuracy": 0.6735802069306374,
"num_tokens": 16926127.0,
"step": 1301
},
{
"epoch": 0.83328,
"grad_norm": 3.571916103363037,
"learning_rate": 5e-06,
"loss": 1.5566,
"mean_token_accuracy": 0.62827018648386,
"num_tokens": 16939816.0,
"step": 1302
},
{
"epoch": 0.83392,
"grad_norm": 3.3074350357055664,
"learning_rate": 5e-06,
"loss": 1.4043,
"mean_token_accuracy": 0.639069065451622,
"num_tokens": 16953696.0,
"step": 1303
},
{
"epoch": 0.83456,
"grad_norm": 3.573622941970825,
"learning_rate": 5e-06,
"loss": 1.3567,
"mean_token_accuracy": 0.6488766446709633,
"num_tokens": 16965974.0,
"step": 1304
},
{
"epoch": 0.8352,
"grad_norm": 3.201739549636841,
"learning_rate": 5e-06,
"loss": 1.2488,
"mean_token_accuracy": 0.6712930873036385,
"num_tokens": 16980031.0,
"step": 1305
},
{
"epoch": 0.83584,
"grad_norm": 3.284263849258423,
"learning_rate": 5e-06,
"loss": 1.3163,
"mean_token_accuracy": 0.6636942848563194,
"num_tokens": 16993264.0,
"step": 1306
},
{
"epoch": 0.83648,
"grad_norm": 3.39267897605896,
"learning_rate": 5e-06,
"loss": 1.2675,
"mean_token_accuracy": 0.6717317998409271,
"num_tokens": 17005939.0,
"step": 1307
},
{
"epoch": 0.83712,
"grad_norm": 3.601962089538574,
"learning_rate": 5e-06,
"loss": 1.2444,
"mean_token_accuracy": 0.7010955587029457,
"num_tokens": 17019858.0,
"step": 1308
},
{
"epoch": 0.83776,
"grad_norm": 4.25007438659668,
"learning_rate": 5e-06,
"loss": 1.2578,
"mean_token_accuracy": 0.6884395852684975,
"num_tokens": 17031840.0,
"step": 1309
},
{
"epoch": 0.8384,
"grad_norm": 3.216642379760742,
"learning_rate": 5e-06,
"loss": 1.07,
"mean_token_accuracy": 0.6811397597193718,
"num_tokens": 17043624.0,
"step": 1310
},
{
"epoch": 0.83904,
"grad_norm": 4.06812858581543,
"learning_rate": 5e-06,
"loss": 1.2633,
"mean_token_accuracy": 0.6581188440322876,
"num_tokens": 17055221.0,
"step": 1311
},
{
"epoch": 0.83968,
"grad_norm": 4.409648418426514,
"learning_rate": 5e-06,
"loss": 1.4064,
"mean_token_accuracy": 0.6504970565438271,
"num_tokens": 17065224.0,
"step": 1312
},
{
"epoch": 0.84032,
"grad_norm": 3.070948839187622,
"learning_rate": 5e-06,
"loss": 1.3761,
"mean_token_accuracy": 0.6479237154126167,
"num_tokens": 17078405.0,
"step": 1313
},
{
"epoch": 0.84096,
"grad_norm": 3.568082094192505,
"learning_rate": 5e-06,
"loss": 1.0255,
"mean_token_accuracy": 0.6967073529958725,
"num_tokens": 17091614.0,
"step": 1314
},
{
"epoch": 0.8416,
"grad_norm": 3.664025068283081,
"learning_rate": 5e-06,
"loss": 1.4398,
"mean_token_accuracy": 0.6788545474410057,
"num_tokens": 17104109.0,
"step": 1315
},
{
"epoch": 0.84224,
"grad_norm": 3.4449939727783203,
"learning_rate": 5e-06,
"loss": 1.203,
"mean_token_accuracy": 0.6785411536693573,
"num_tokens": 17116517.0,
"step": 1316
},
{
"epoch": 0.84288,
"grad_norm": 3.2764899730682373,
"learning_rate": 5e-06,
"loss": 1.2928,
"mean_token_accuracy": 0.6439727321267128,
"num_tokens": 17130912.0,
"step": 1317
},
{
"epoch": 0.84352,
"grad_norm": 3.6440088748931885,
"learning_rate": 5e-06,
"loss": 1.076,
"mean_token_accuracy": 0.7138596475124359,
"num_tokens": 17143603.0,
"step": 1318
},
{
"epoch": 0.84416,
"grad_norm": 3.7815802097320557,
"learning_rate": 5e-06,
"loss": 1.4247,
"mean_token_accuracy": 0.6309964135289192,
"num_tokens": 17156597.0,
"step": 1319
},
{
"epoch": 0.8448,
"grad_norm": 3.145379066467285,
"learning_rate": 5e-06,
"loss": 1.0981,
"mean_token_accuracy": 0.7020114660263062,
"num_tokens": 17170210.0,
"step": 1320
},
{
"epoch": 0.84544,
"grad_norm": 4.029253005981445,
"learning_rate": 5e-06,
"loss": 1.4513,
"mean_token_accuracy": 0.6537614092230797,
"num_tokens": 17182328.0,
"step": 1321
},
{
"epoch": 0.84608,
"grad_norm": 3.2656235694885254,
"learning_rate": 5e-06,
"loss": 1.5357,
"mean_token_accuracy": 0.641093410551548,
"num_tokens": 17197005.0,
"step": 1322
},
{
"epoch": 0.84672,
"grad_norm": 3.559967041015625,
"learning_rate": 5e-06,
"loss": 1.0718,
"mean_token_accuracy": 0.7045318782329559,
"num_tokens": 17208973.0,
"step": 1323
},
{
"epoch": 0.84736,
"grad_norm": 3.366745710372925,
"learning_rate": 5e-06,
"loss": 1.3679,
"mean_token_accuracy": 0.6683920547366142,
"num_tokens": 17221909.0,
"step": 1324
},
{
"epoch": 0.848,
"grad_norm": 3.4706954956054688,
"learning_rate": 5e-06,
"loss": 1.317,
"mean_token_accuracy": 0.6562648341059685,
"num_tokens": 17234739.0,
"step": 1325
},
{
"epoch": 0.84864,
"grad_norm": 3.4657156467437744,
"learning_rate": 5e-06,
"loss": 1.4667,
"mean_token_accuracy": 0.6328883245587349,
"num_tokens": 17249245.0,
"step": 1326
},
{
"epoch": 0.84928,
"grad_norm": 3.4521939754486084,
"learning_rate": 5e-06,
"loss": 1.4047,
"mean_token_accuracy": 0.6429140567779541,
"num_tokens": 17263466.0,
"step": 1327
},
{
"epoch": 0.84992,
"grad_norm": 3.3580243587493896,
"learning_rate": 5e-06,
"loss": 1.4,
"mean_token_accuracy": 0.6485451236367226,
"num_tokens": 17277966.0,
"step": 1328
},
{
"epoch": 0.85056,
"grad_norm": 3.6181726455688477,
"learning_rate": 5e-06,
"loss": 1.4906,
"mean_token_accuracy": 0.6203976050019264,
"num_tokens": 17290080.0,
"step": 1329
},
{
"epoch": 0.8512,
"grad_norm": 3.0654401779174805,
"learning_rate": 5e-06,
"loss": 1.3708,
"mean_token_accuracy": 0.6423755809664726,
"num_tokens": 17307462.0,
"step": 1330
},
{
"epoch": 0.85184,
"grad_norm": 3.682450294494629,
"learning_rate": 5e-06,
"loss": 1.4412,
"mean_token_accuracy": 0.6563334167003632,
"num_tokens": 17320760.0,
"step": 1331
},
{
"epoch": 0.85248,
"grad_norm": 4.22981071472168,
"learning_rate": 5e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.687714472413063,
"num_tokens": 17330799.0,
"step": 1332
},
{
"epoch": 0.85312,
"grad_norm": 3.9495580196380615,
"learning_rate": 5e-06,
"loss": 1.385,
"mean_token_accuracy": 0.643461637198925,
"num_tokens": 17340826.0,
"step": 1333
},
{
"epoch": 0.85376,
"grad_norm": 3.5318918228149414,
"learning_rate": 5e-06,
"loss": 1.2977,
"mean_token_accuracy": 0.6616112142801285,
"num_tokens": 17353805.0,
"step": 1334
},
{
"epoch": 0.8544,
"grad_norm": 3.967776298522949,
"learning_rate": 5e-06,
"loss": 1.2952,
"mean_token_accuracy": 0.6506235525012016,
"num_tokens": 17366394.0,
"step": 1335
},
{
"epoch": 0.85504,
"grad_norm": 3.663810968399048,
"learning_rate": 5e-06,
"loss": 1.148,
"mean_token_accuracy": 0.6877422258257866,
"num_tokens": 17377970.0,
"step": 1336
},
{
"epoch": 0.85568,
"grad_norm": 3.229074478149414,
"learning_rate": 5e-06,
"loss": 1.2273,
"mean_token_accuracy": 0.6863315925002098,
"num_tokens": 17392020.0,
"step": 1337
},
{
"epoch": 0.85632,
"grad_norm": 3.3477957248687744,
"learning_rate": 5e-06,
"loss": 1.054,
"mean_token_accuracy": 0.7150156199932098,
"num_tokens": 17404119.0,
"step": 1338
},
{
"epoch": 0.85696,
"grad_norm": 3.4252710342407227,
"learning_rate": 5e-06,
"loss": 1.4177,
"mean_token_accuracy": 0.6546554416418076,
"num_tokens": 17418373.0,
"step": 1339
},
{
"epoch": 0.8576,
"grad_norm": 3.3960907459259033,
"learning_rate": 5e-06,
"loss": 1.2424,
"mean_token_accuracy": 0.6713818609714508,
"num_tokens": 17430650.0,
"step": 1340
},
{
"epoch": 0.85824,
"grad_norm": 3.5569021701812744,
"learning_rate": 5e-06,
"loss": 1.4048,
"mean_token_accuracy": 0.6662162095308304,
"num_tokens": 17443979.0,
"step": 1341
},
{
"epoch": 0.85888,
"grad_norm": 3.508941650390625,
"learning_rate": 5e-06,
"loss": 1.552,
"mean_token_accuracy": 0.6020488813519478,
"num_tokens": 17458181.0,
"step": 1342
},
{
"epoch": 0.85952,
"grad_norm": 3.9543237686157227,
"learning_rate": 5e-06,
"loss": 1.5179,
"mean_token_accuracy": 0.6510177925229073,
"num_tokens": 17469923.0,
"step": 1343
},
{
"epoch": 0.86016,
"grad_norm": 4.113687515258789,
"learning_rate": 5e-06,
"loss": 1.3311,
"mean_token_accuracy": 0.652983695268631,
"num_tokens": 17483696.0,
"step": 1344
},
{
"epoch": 0.8608,
"grad_norm": 3.756329298019409,
"learning_rate": 5e-06,
"loss": 1.2371,
"mean_token_accuracy": 0.6460439562797546,
"num_tokens": 17496524.0,
"step": 1345
},
{
"epoch": 0.86144,
"grad_norm": 3.375931978225708,
"learning_rate": 5e-06,
"loss": 1.3934,
"mean_token_accuracy": 0.6481117159128189,
"num_tokens": 17510332.0,
"step": 1346
},
{
"epoch": 0.86208,
"grad_norm": 4.059141635894775,
"learning_rate": 5e-06,
"loss": 1.0956,
"mean_token_accuracy": 0.6906588524580002,
"num_tokens": 17520886.0,
"step": 1347
},
{
"epoch": 0.86272,
"grad_norm": 2.9917287826538086,
"learning_rate": 5e-06,
"loss": 1.2105,
"mean_token_accuracy": 0.6634574681520462,
"num_tokens": 17536190.0,
"step": 1348
},
{
"epoch": 0.86336,
"grad_norm": 3.9010698795318604,
"learning_rate": 5e-06,
"loss": 1.2638,
"mean_token_accuracy": 0.6710032075643539,
"num_tokens": 17548427.0,
"step": 1349
},
{
"epoch": 0.864,
"grad_norm": 3.535780668258667,
"learning_rate": 5e-06,
"loss": 1.2962,
"mean_token_accuracy": 0.6431876122951508,
"num_tokens": 17561319.0,
"step": 1350
},
{
"epoch": 0.86464,
"grad_norm": 3.2573955059051514,
"learning_rate": 5e-06,
"loss": 1.233,
"mean_token_accuracy": 0.6907457932829857,
"num_tokens": 17573944.0,
"step": 1351
},
{
"epoch": 0.86528,
"grad_norm": 3.478487491607666,
"learning_rate": 5e-06,
"loss": 1.1014,
"mean_token_accuracy": 0.6931867897510529,
"num_tokens": 17587912.0,
"step": 1352
},
{
"epoch": 0.86592,
"grad_norm": 3.618330955505371,
"learning_rate": 5e-06,
"loss": 1.4492,
"mean_token_accuracy": 0.6480180844664574,
"num_tokens": 17600364.0,
"step": 1353
},
{
"epoch": 0.86656,
"grad_norm": 3.834172248840332,
"learning_rate": 5e-06,
"loss": 1.4564,
"mean_token_accuracy": 0.6302541047334671,
"num_tokens": 17614033.0,
"step": 1354
},
{
"epoch": 0.8672,
"grad_norm": 3.973057746887207,
"learning_rate": 5e-06,
"loss": 1.4296,
"mean_token_accuracy": 0.6398394256830215,
"num_tokens": 17626618.0,
"step": 1355
},
{
"epoch": 0.86784,
"grad_norm": 3.6730847358703613,
"learning_rate": 5e-06,
"loss": 1.3343,
"mean_token_accuracy": 0.6507444530725479,
"num_tokens": 17638206.0,
"step": 1356
},
{
"epoch": 0.86848,
"grad_norm": 3.6375482082366943,
"learning_rate": 5e-06,
"loss": 1.3747,
"mean_token_accuracy": 0.6530400216579437,
"num_tokens": 17650041.0,
"step": 1357
},
{
"epoch": 0.86912,
"grad_norm": 3.4408140182495117,
"learning_rate": 5e-06,
"loss": 1.1361,
"mean_token_accuracy": 0.6785493567585945,
"num_tokens": 17661144.0,
"step": 1358
},
{
"epoch": 0.86976,
"grad_norm": 3.449578046798706,
"learning_rate": 5e-06,
"loss": 1.3709,
"mean_token_accuracy": 0.6438928842544556,
"num_tokens": 17674539.0,
"step": 1359
},
{
"epoch": 0.8704,
"grad_norm": 5.356245994567871,
"learning_rate": 5e-06,
"loss": 1.4153,
"mean_token_accuracy": 0.6556727215647697,
"num_tokens": 17685582.0,
"step": 1360
},
{
"epoch": 0.87104,
"grad_norm": 3.2209205627441406,
"learning_rate": 5e-06,
"loss": 1.3495,
"mean_token_accuracy": 0.6475684642791748,
"num_tokens": 17700534.0,
"step": 1361
},
{
"epoch": 0.87168,
"grad_norm": 4.095639705657959,
"learning_rate": 5e-06,
"loss": 1.3171,
"mean_token_accuracy": 0.6621948033571243,
"num_tokens": 17712854.0,
"step": 1362
},
{
"epoch": 0.87232,
"grad_norm": 4.265082359313965,
"learning_rate": 5e-06,
"loss": 1.3075,
"mean_token_accuracy": 0.6697151511907578,
"num_tokens": 17723003.0,
"step": 1363
},
{
"epoch": 0.87296,
"grad_norm": 3.368932008743286,
"learning_rate": 5e-06,
"loss": 1.4089,
"mean_token_accuracy": 0.6478553786873817,
"num_tokens": 17736730.0,
"step": 1364
},
{
"epoch": 0.8736,
"grad_norm": 3.5103371143341064,
"learning_rate": 5e-06,
"loss": 1.1961,
"mean_token_accuracy": 0.6800166815519333,
"num_tokens": 17749195.0,
"step": 1365
},
{
"epoch": 0.87424,
"grad_norm": 3.6628217697143555,
"learning_rate": 5e-06,
"loss": 1.2206,
"mean_token_accuracy": 0.7043485268950462,
"num_tokens": 17761716.0,
"step": 1366
},
{
"epoch": 0.87488,
"grad_norm": 3.283897638320923,
"learning_rate": 5e-06,
"loss": 1.2755,
"mean_token_accuracy": 0.6614806577563286,
"num_tokens": 17776711.0,
"step": 1367
},
{
"epoch": 0.87552,
"grad_norm": 4.253682613372803,
"learning_rate": 5e-06,
"loss": 1.1304,
"mean_token_accuracy": 0.7110883370041847,
"num_tokens": 17787618.0,
"step": 1368
},
{
"epoch": 0.87616,
"grad_norm": 3.7107419967651367,
"learning_rate": 5e-06,
"loss": 1.353,
"mean_token_accuracy": 0.6722783967852592,
"num_tokens": 17798686.0,
"step": 1369
},
{
"epoch": 0.8768,
"grad_norm": 4.0010271072387695,
"learning_rate": 5e-06,
"loss": 1.2605,
"mean_token_accuracy": 0.660023458302021,
"num_tokens": 17812008.0,
"step": 1370
},
{
"epoch": 0.87744,
"grad_norm": 3.8963913917541504,
"learning_rate": 5e-06,
"loss": 1.0612,
"mean_token_accuracy": 0.6989761069417,
"num_tokens": 17822062.0,
"step": 1371
},
{
"epoch": 0.87808,
"grad_norm": 3.409618854522705,
"learning_rate": 5e-06,
"loss": 1.5595,
"mean_token_accuracy": 0.6186339408159256,
"num_tokens": 17836142.0,
"step": 1372
},
{
"epoch": 0.87872,
"grad_norm": 2.955591917037964,
"learning_rate": 5e-06,
"loss": 1.2973,
"mean_token_accuracy": 0.6578470319509506,
"num_tokens": 17850508.0,
"step": 1373
},
{
"epoch": 0.87936,
"grad_norm": 3.400749921798706,
"learning_rate": 5e-06,
"loss": 1.1947,
"mean_token_accuracy": 0.6701619401574135,
"num_tokens": 17864240.0,
"step": 1374
},
{
"epoch": 0.88,
"grad_norm": 3.2822978496551514,
"learning_rate": 5e-06,
"loss": 1.46,
"mean_token_accuracy": 0.6225104928016663,
"num_tokens": 17879022.0,
"step": 1375
},
{
"epoch": 0.88064,
"grad_norm": 3.9761667251586914,
"learning_rate": 5e-06,
"loss": 1.1623,
"mean_token_accuracy": 0.6682100668549538,
"num_tokens": 17890289.0,
"step": 1376
},
{
"epoch": 0.88128,
"grad_norm": 3.6653897762298584,
"learning_rate": 5e-06,
"loss": 1.3524,
"mean_token_accuracy": 0.6353943534195423,
"num_tokens": 17903080.0,
"step": 1377
},
{
"epoch": 0.88192,
"grad_norm": 4.603322505950928,
"learning_rate": 5e-06,
"loss": 1.6278,
"mean_token_accuracy": 0.6351469904184341,
"num_tokens": 17912567.0,
"step": 1378
},
{
"epoch": 0.88256,
"grad_norm": 3.411752700805664,
"learning_rate": 5e-06,
"loss": 1.2195,
"mean_token_accuracy": 0.675203487277031,
"num_tokens": 17927030.0,
"step": 1379
},
{
"epoch": 0.8832,
"grad_norm": 4.03117036819458,
"learning_rate": 5e-06,
"loss": 1.1379,
"mean_token_accuracy": 0.6800655201077461,
"num_tokens": 17936846.0,
"step": 1380
},
{
"epoch": 0.88384,
"grad_norm": 3.4626095294952393,
"learning_rate": 5e-06,
"loss": 1.2256,
"mean_token_accuracy": 0.6742624565958977,
"num_tokens": 17949176.0,
"step": 1381
},
{
"epoch": 0.88448,
"grad_norm": 3.326813220977783,
"learning_rate": 5e-06,
"loss": 1.2921,
"mean_token_accuracy": 0.6827266663312912,
"num_tokens": 17962574.0,
"step": 1382
},
{
"epoch": 0.88512,
"grad_norm": 3.539931535720825,
"learning_rate": 5e-06,
"loss": 1.0815,
"mean_token_accuracy": 0.7009731009602547,
"num_tokens": 17975312.0,
"step": 1383
},
{
"epoch": 0.88576,
"grad_norm": 3.1076414585113525,
"learning_rate": 5e-06,
"loss": 1.2004,
"mean_token_accuracy": 0.6538084149360657,
"num_tokens": 17992044.0,
"step": 1384
},
{
"epoch": 0.8864,
"grad_norm": 3.54392147064209,
"learning_rate": 5e-06,
"loss": 1.3749,
"mean_token_accuracy": 0.6528435945510864,
"num_tokens": 18004880.0,
"step": 1385
},
{
"epoch": 0.88704,
"grad_norm": 3.1049365997314453,
"learning_rate": 5e-06,
"loss": 1.4558,
"mean_token_accuracy": 0.6254525259137154,
"num_tokens": 18018899.0,
"step": 1386
},
{
"epoch": 0.88768,
"grad_norm": 3.872276782989502,
"learning_rate": 5e-06,
"loss": 1.3721,
"mean_token_accuracy": 0.6540368646383286,
"num_tokens": 18031425.0,
"step": 1387
},
{
"epoch": 0.88832,
"grad_norm": 4.218468189239502,
"learning_rate": 5e-06,
"loss": 1.1603,
"mean_token_accuracy": 0.6857082098722458,
"num_tokens": 18042604.0,
"step": 1388
},
{
"epoch": 0.88896,
"grad_norm": 3.564180374145508,
"learning_rate": 5e-06,
"loss": 1.3641,
"mean_token_accuracy": 0.6548919975757599,
"num_tokens": 18055105.0,
"step": 1389
},
{
"epoch": 0.8896,
"grad_norm": 3.4216361045837402,
"learning_rate": 5e-06,
"loss": 1.3133,
"mean_token_accuracy": 0.6648430451750755,
"num_tokens": 18068895.0,
"step": 1390
},
{
"epoch": 0.89024,
"grad_norm": 3.466216564178467,
"learning_rate": 5e-06,
"loss": 1.426,
"mean_token_accuracy": 0.675171747803688,
"num_tokens": 18082806.0,
"step": 1391
},
{
"epoch": 0.89088,
"grad_norm": 4.009366512298584,
"learning_rate": 5e-06,
"loss": 1.151,
"mean_token_accuracy": 0.6659594774246216,
"num_tokens": 18093287.0,
"step": 1392
},
{
"epoch": 0.89152,
"grad_norm": 3.287865161895752,
"learning_rate": 5e-06,
"loss": 1.1818,
"mean_token_accuracy": 0.697388269007206,
"num_tokens": 18107755.0,
"step": 1393
},
{
"epoch": 0.89216,
"grad_norm": 3.865363597869873,
"learning_rate": 5e-06,
"loss": 1.2494,
"mean_token_accuracy": 0.6640971228480339,
"num_tokens": 18118665.0,
"step": 1394
},
{
"epoch": 0.8928,
"grad_norm": 3.694581985473633,
"learning_rate": 5e-06,
"loss": 1.2886,
"mean_token_accuracy": 0.6522090062499046,
"num_tokens": 18130135.0,
"step": 1395
},
{
"epoch": 0.89344,
"grad_norm": 3.5079498291015625,
"learning_rate": 5e-06,
"loss": 1.2224,
"mean_token_accuracy": 0.6756654903292656,
"num_tokens": 18143669.0,
"step": 1396
},
{
"epoch": 0.89408,
"grad_norm": 3.9231410026550293,
"learning_rate": 5e-06,
"loss": 1.2382,
"mean_token_accuracy": 0.6565620601177216,
"num_tokens": 18155787.0,
"step": 1397
},
{
"epoch": 0.89472,
"grad_norm": 3.2922706604003906,
"learning_rate": 5e-06,
"loss": 1.3624,
"mean_token_accuracy": 0.657343290746212,
"num_tokens": 18169330.0,
"step": 1398
},
{
"epoch": 0.89536,
"grad_norm": 4.219677448272705,
"learning_rate": 5e-06,
"loss": 1.3696,
"mean_token_accuracy": 0.6795709133148193,
"num_tokens": 18181111.0,
"step": 1399
},
{
"epoch": 0.896,
"grad_norm": 3.3847157955169678,
"learning_rate": 5e-06,
"loss": 1.2803,
"mean_token_accuracy": 0.6801963672041893,
"num_tokens": 18194826.0,
"step": 1400
},
{
"epoch": 0.89664,
"grad_norm": 3.3101882934570312,
"learning_rate": 5e-06,
"loss": 1.2146,
"mean_token_accuracy": 0.6810724586248398,
"num_tokens": 18207891.0,
"step": 1401
},
{
"epoch": 0.89728,
"grad_norm": 4.586159706115723,
"learning_rate": 5e-06,
"loss": 1.234,
"mean_token_accuracy": 0.6616763696074486,
"num_tokens": 18219068.0,
"step": 1402
},
{
"epoch": 0.89792,
"grad_norm": 2.9213805198669434,
"learning_rate": 5e-06,
"loss": 1.5959,
"mean_token_accuracy": 0.6127992421388626,
"num_tokens": 18234945.0,
"step": 1403
},
{
"epoch": 0.89856,
"grad_norm": 3.180678606033325,
"learning_rate": 5e-06,
"loss": 1.2227,
"mean_token_accuracy": 0.680058054625988,
"num_tokens": 18249768.0,
"step": 1404
},
{
"epoch": 0.8992,
"grad_norm": 3.4679532051086426,
"learning_rate": 5e-06,
"loss": 1.2374,
"mean_token_accuracy": 0.67696313560009,
"num_tokens": 18265924.0,
"step": 1405
},
{
"epoch": 0.89984,
"grad_norm": 3.4234979152679443,
"learning_rate": 5e-06,
"loss": 1.5505,
"mean_token_accuracy": 0.6274235621094704,
"num_tokens": 18280819.0,
"step": 1406
},
{
"epoch": 0.90048,
"grad_norm": 4.96069860458374,
"learning_rate": 5e-06,
"loss": 1.4236,
"mean_token_accuracy": 0.607517022639513,
"num_tokens": 18291686.0,
"step": 1407
},
{
"epoch": 0.90112,
"grad_norm": 3.1977005004882812,
"learning_rate": 5e-06,
"loss": 1.3486,
"mean_token_accuracy": 0.6483859121799469,
"num_tokens": 18304993.0,
"step": 1408
},
{
"epoch": 0.90176,
"grad_norm": 3.5749099254608154,
"learning_rate": 5e-06,
"loss": 1.2922,
"mean_token_accuracy": 0.6452238261699677,
"num_tokens": 18319373.0,
"step": 1409
},
{
"epoch": 0.9024,
"grad_norm": 3.388899803161621,
"learning_rate": 5e-06,
"loss": 1.3281,
"mean_token_accuracy": 0.6475742906332016,
"num_tokens": 18331998.0,
"step": 1410
},
{
"epoch": 0.90304,
"grad_norm": 3.4031882286071777,
"learning_rate": 5e-06,
"loss": 1.3355,
"mean_token_accuracy": 0.6812806725502014,
"num_tokens": 18344632.0,
"step": 1411
},
{
"epoch": 0.90368,
"grad_norm": 3.8880221843719482,
"learning_rate": 5e-06,
"loss": 1.3898,
"mean_token_accuracy": 0.6473888382315636,
"num_tokens": 18356350.0,
"step": 1412
},
{
"epoch": 0.90432,
"grad_norm": 3.5985724925994873,
"learning_rate": 5e-06,
"loss": 1.2345,
"mean_token_accuracy": 0.6557316966354847,
"num_tokens": 18368400.0,
"step": 1413
},
{
"epoch": 0.90496,
"grad_norm": 3.6234962940216064,
"learning_rate": 5e-06,
"loss": 1.1942,
"mean_token_accuracy": 0.6906508356332779,
"num_tokens": 18379118.0,
"step": 1414
},
{
"epoch": 0.9056,
"grad_norm": 3.8934993743896484,
"learning_rate": 5e-06,
"loss": 1.2382,
"mean_token_accuracy": 0.6724176928400993,
"num_tokens": 18391595.0,
"step": 1415
},
{
"epoch": 0.90624,
"grad_norm": 3.603591203689575,
"learning_rate": 5e-06,
"loss": 1.3737,
"mean_token_accuracy": 0.6498560681939125,
"num_tokens": 18403595.0,
"step": 1416
},
{
"epoch": 0.90688,
"grad_norm": 3.2106738090515137,
"learning_rate": 5e-06,
"loss": 1.2911,
"mean_token_accuracy": 0.6614857837557793,
"num_tokens": 18418034.0,
"step": 1417
},
{
"epoch": 0.90752,
"grad_norm": 3.0255284309387207,
"learning_rate": 5e-06,
"loss": 1.2975,
"mean_token_accuracy": 0.653803177177906,
"num_tokens": 18434798.0,
"step": 1418
},
{
"epoch": 0.90816,
"grad_norm": 3.696108818054199,
"learning_rate": 5e-06,
"loss": 1.3184,
"mean_token_accuracy": 0.6341993510723114,
"num_tokens": 18446612.0,
"step": 1419
},
{
"epoch": 0.9088,
"grad_norm": 4.0753254890441895,
"learning_rate": 5e-06,
"loss": 1.2244,
"mean_token_accuracy": 0.6535854563117027,
"num_tokens": 18458141.0,
"step": 1420
},
{
"epoch": 0.90944,
"grad_norm": 3.655604124069214,
"learning_rate": 5e-06,
"loss": 1.3088,
"mean_token_accuracy": 0.6778343543410301,
"num_tokens": 18471653.0,
"step": 1421
},
{
"epoch": 0.91008,
"grad_norm": 3.4860193729400635,
"learning_rate": 5e-06,
"loss": 1.2065,
"mean_token_accuracy": 0.6871431916952133,
"num_tokens": 18482903.0,
"step": 1422
},
{
"epoch": 0.91072,
"grad_norm": 3.5701212882995605,
"learning_rate": 5e-06,
"loss": 1.161,
"mean_token_accuracy": 0.6876110881567001,
"num_tokens": 18495519.0,
"step": 1423
},
{
"epoch": 0.91136,
"grad_norm": 4.311164855957031,
"learning_rate": 5e-06,
"loss": 1.2691,
"mean_token_accuracy": 0.6963246017694473,
"num_tokens": 18506391.0,
"step": 1424
},
{
"epoch": 0.912,
"grad_norm": 3.228339672088623,
"learning_rate": 5e-06,
"loss": 1.2486,
"mean_token_accuracy": 0.6647578254342079,
"num_tokens": 18521751.0,
"step": 1425
},
{
"epoch": 0.91264,
"grad_norm": 3.649463176727295,
"learning_rate": 5e-06,
"loss": 1.2265,
"mean_token_accuracy": 0.6655023992061615,
"num_tokens": 18533605.0,
"step": 1426
},
{
"epoch": 0.91328,
"grad_norm": 3.822047710418701,
"learning_rate": 5e-06,
"loss": 1.2303,
"mean_token_accuracy": 0.6853557825088501,
"num_tokens": 18545920.0,
"step": 1427
},
{
"epoch": 0.91392,
"grad_norm": 3.622427463531494,
"learning_rate": 5e-06,
"loss": 1.3153,
"mean_token_accuracy": 0.6682358086109161,
"num_tokens": 18558370.0,
"step": 1428
},
{
"epoch": 0.91456,
"grad_norm": 3.013226270675659,
"learning_rate": 5e-06,
"loss": 1.0413,
"mean_token_accuracy": 0.7230858653783798,
"num_tokens": 18572388.0,
"step": 1429
},
{
"epoch": 0.9152,
"grad_norm": 2.999063730239868,
"learning_rate": 5e-06,
"loss": 1.2757,
"mean_token_accuracy": 0.658422015607357,
"num_tokens": 18587001.0,
"step": 1430
},
{
"epoch": 0.91584,
"grad_norm": 3.246445417404175,
"learning_rate": 5e-06,
"loss": 1.1428,
"mean_token_accuracy": 0.7174563780426979,
"num_tokens": 18600196.0,
"step": 1431
},
{
"epoch": 0.91648,
"grad_norm": 3.52728533744812,
"learning_rate": 5e-06,
"loss": 1.2719,
"mean_token_accuracy": 0.6571086049079895,
"num_tokens": 18612602.0,
"step": 1432
},
{
"epoch": 0.91712,
"grad_norm": 3.3236947059631348,
"learning_rate": 5e-06,
"loss": 1.3722,
"mean_token_accuracy": 0.6516182944178581,
"num_tokens": 18628569.0,
"step": 1433
},
{
"epoch": 0.91776,
"grad_norm": 3.9207522869110107,
"learning_rate": 5e-06,
"loss": 1.289,
"mean_token_accuracy": 0.6646075919270515,
"num_tokens": 18639375.0,
"step": 1434
},
{
"epoch": 0.9184,
"grad_norm": 3.3679165840148926,
"learning_rate": 5e-06,
"loss": 1.3844,
"mean_token_accuracy": 0.6545412912964821,
"num_tokens": 18652531.0,
"step": 1435
},
{
"epoch": 0.91904,
"grad_norm": 3.58003830909729,
"learning_rate": 5e-06,
"loss": 1.3116,
"mean_token_accuracy": 0.655610017478466,
"num_tokens": 18665160.0,
"step": 1436
},
{
"epoch": 0.91968,
"grad_norm": 3.827817916870117,
"learning_rate": 5e-06,
"loss": 1.1945,
"mean_token_accuracy": 0.6569493412971497,
"num_tokens": 18676671.0,
"step": 1437
},
{
"epoch": 0.92032,
"grad_norm": 3.6998956203460693,
"learning_rate": 5e-06,
"loss": 1.5481,
"mean_token_accuracy": 0.6249835789203644,
"num_tokens": 18690078.0,
"step": 1438
},
{
"epoch": 0.92096,
"grad_norm": 3.2389333248138428,
"learning_rate": 5e-06,
"loss": 1.2938,
"mean_token_accuracy": 0.65943942964077,
"num_tokens": 18703678.0,
"step": 1439
},
{
"epoch": 0.9216,
"grad_norm": 2.924175262451172,
"learning_rate": 5e-06,
"loss": 1.2873,
"mean_token_accuracy": 0.6494470685720444,
"num_tokens": 18719576.0,
"step": 1440
},
{
"epoch": 0.92224,
"grad_norm": 3.7290942668914795,
"learning_rate": 5e-06,
"loss": 1.2667,
"mean_token_accuracy": 0.6728792116045952,
"num_tokens": 18732712.0,
"step": 1441
},
{
"epoch": 0.92288,
"grad_norm": 3.406003952026367,
"learning_rate": 5e-06,
"loss": 1.1128,
"mean_token_accuracy": 0.7027332484722137,
"num_tokens": 18745929.0,
"step": 1442
},
{
"epoch": 0.92352,
"grad_norm": 3.9130918979644775,
"learning_rate": 5e-06,
"loss": 1.1714,
"mean_token_accuracy": 0.6731210052967072,
"num_tokens": 18755977.0,
"step": 1443
},
{
"epoch": 0.92416,
"grad_norm": 3.678868055343628,
"learning_rate": 5e-06,
"loss": 1.3613,
"mean_token_accuracy": 0.6376957893371582,
"num_tokens": 18767848.0,
"step": 1444
},
{
"epoch": 0.9248,
"grad_norm": 3.355009078979492,
"learning_rate": 5e-06,
"loss": 1.4501,
"mean_token_accuracy": 0.6530297324061394,
"num_tokens": 18781692.0,
"step": 1445
},
{
"epoch": 0.92544,
"grad_norm": 3.197375774383545,
"learning_rate": 5e-06,
"loss": 1.4667,
"mean_token_accuracy": 0.6258358731865883,
"num_tokens": 18796361.0,
"step": 1446
},
{
"epoch": 0.92608,
"grad_norm": 3.364900588989258,
"learning_rate": 5e-06,
"loss": 1.4204,
"mean_token_accuracy": 0.6358629465103149,
"num_tokens": 18810771.0,
"step": 1447
},
{
"epoch": 0.92672,
"grad_norm": 3.323707342147827,
"learning_rate": 5e-06,
"loss": 1.1537,
"mean_token_accuracy": 0.700812466442585,
"num_tokens": 18824895.0,
"step": 1448
},
{
"epoch": 0.92736,
"grad_norm": 3.5423851013183594,
"learning_rate": 5e-06,
"loss": 1.1198,
"mean_token_accuracy": 0.6927414685487747,
"num_tokens": 18838244.0,
"step": 1449
},
{
"epoch": 0.928,
"grad_norm": 3.5557827949523926,
"learning_rate": 5e-06,
"loss": 1.3942,
"mean_token_accuracy": 0.6344395503401756,
"num_tokens": 18850747.0,
"step": 1450
},
{
"epoch": 0.92864,
"grad_norm": 3.8772428035736084,
"learning_rate": 5e-06,
"loss": 1.1849,
"mean_token_accuracy": 0.6797264739871025,
"num_tokens": 18863209.0,
"step": 1451
},
{
"epoch": 0.92928,
"grad_norm": 3.387641668319702,
"learning_rate": 5e-06,
"loss": 1.4152,
"mean_token_accuracy": 0.6333313882350922,
"num_tokens": 18876056.0,
"step": 1452
},
{
"epoch": 0.92992,
"grad_norm": 3.554407835006714,
"learning_rate": 5e-06,
"loss": 1.1832,
"mean_token_accuracy": 0.6640536859631538,
"num_tokens": 18890920.0,
"step": 1453
},
{
"epoch": 0.93056,
"grad_norm": 3.302236795425415,
"learning_rate": 5e-06,
"loss": 1.5489,
"mean_token_accuracy": 0.6134847179055214,
"num_tokens": 18905793.0,
"step": 1454
},
{
"epoch": 0.9312,
"grad_norm": 3.531574010848999,
"learning_rate": 5e-06,
"loss": 1.2801,
"mean_token_accuracy": 0.6507202833890915,
"num_tokens": 18920224.0,
"step": 1455
},
{
"epoch": 0.93184,
"grad_norm": 3.5933139324188232,
"learning_rate": 5e-06,
"loss": 1.3922,
"mean_token_accuracy": 0.6551200449466705,
"num_tokens": 18932613.0,
"step": 1456
},
{
"epoch": 0.93248,
"grad_norm": 3.254462480545044,
"learning_rate": 5e-06,
"loss": 1.3985,
"mean_token_accuracy": 0.6505570337176323,
"num_tokens": 18946774.0,
"step": 1457
},
{
"epoch": 0.93312,
"grad_norm": 3.2945821285247803,
"learning_rate": 5e-06,
"loss": 1.5279,
"mean_token_accuracy": 0.6084811314940453,
"num_tokens": 18961275.0,
"step": 1458
},
{
"epoch": 0.93376,
"grad_norm": 3.2776741981506348,
"learning_rate": 5e-06,
"loss": 1.3401,
"mean_token_accuracy": 0.640129804611206,
"num_tokens": 18975529.0,
"step": 1459
},
{
"epoch": 0.9344,
"grad_norm": 3.2493832111358643,
"learning_rate": 5e-06,
"loss": 1.1077,
"mean_token_accuracy": 0.6907928735017776,
"num_tokens": 18988267.0,
"step": 1460
},
{
"epoch": 0.93504,
"grad_norm": 3.765650987625122,
"learning_rate": 5e-06,
"loss": 1.3092,
"mean_token_accuracy": 0.6711199656128883,
"num_tokens": 19000229.0,
"step": 1461
},
{
"epoch": 0.93568,
"grad_norm": 3.1340558528900146,
"learning_rate": 5e-06,
"loss": 1.4336,
"mean_token_accuracy": 0.6485133692622185,
"num_tokens": 19014356.0,
"step": 1462
},
{
"epoch": 0.93632,
"grad_norm": 3.672553300857544,
"learning_rate": 5e-06,
"loss": 1.1751,
"mean_token_accuracy": 0.664104662835598,
"num_tokens": 19025717.0,
"step": 1463
},
{
"epoch": 0.93696,
"grad_norm": 3.753906726837158,
"learning_rate": 5e-06,
"loss": 1.1003,
"mean_token_accuracy": 0.6995716020464897,
"num_tokens": 19037864.0,
"step": 1464
},
{
"epoch": 0.9376,
"grad_norm": 3.1207399368286133,
"learning_rate": 5e-06,
"loss": 1.2334,
"mean_token_accuracy": 0.6692882552742958,
"num_tokens": 19052336.0,
"step": 1465
},
{
"epoch": 0.93824,
"grad_norm": 3.639620065689087,
"learning_rate": 5e-06,
"loss": 1.396,
"mean_token_accuracy": 0.6677844971418381,
"num_tokens": 19065183.0,
"step": 1466
},
{
"epoch": 0.93888,
"grad_norm": 3.5665981769561768,
"learning_rate": 5e-06,
"loss": 1.3489,
"mean_token_accuracy": 0.66384107619524,
"num_tokens": 19078765.0,
"step": 1467
},
{
"epoch": 0.93952,
"grad_norm": 3.5918264389038086,
"learning_rate": 5e-06,
"loss": 1.4087,
"mean_token_accuracy": 0.6427194476127625,
"num_tokens": 19091098.0,
"step": 1468
},
{
"epoch": 0.94016,
"grad_norm": 3.3692591190338135,
"learning_rate": 5e-06,
"loss": 1.3897,
"mean_token_accuracy": 0.6431680992245674,
"num_tokens": 19105664.0,
"step": 1469
},
{
"epoch": 0.9408,
"grad_norm": 3.6854288578033447,
"learning_rate": 5e-06,
"loss": 1.3319,
"mean_token_accuracy": 0.6552760303020477,
"num_tokens": 19118215.0,
"step": 1470
},
{
"epoch": 0.94144,
"grad_norm": 3.3998701572418213,
"learning_rate": 5e-06,
"loss": 1.1683,
"mean_token_accuracy": 0.675237774848938,
"num_tokens": 19130126.0,
"step": 1471
},
{
"epoch": 0.94208,
"grad_norm": 3.5668833255767822,
"learning_rate": 5e-06,
"loss": 1.4991,
"mean_token_accuracy": 0.6222522705793381,
"num_tokens": 19142375.0,
"step": 1472
},
{
"epoch": 0.94272,
"grad_norm": 3.275745153427124,
"learning_rate": 5e-06,
"loss": 1.3953,
"mean_token_accuracy": 0.6239468678832054,
"num_tokens": 19157943.0,
"step": 1473
},
{
"epoch": 0.94336,
"grad_norm": 4.061445236206055,
"learning_rate": 5e-06,
"loss": 1.3817,
"mean_token_accuracy": 0.6464495584368706,
"num_tokens": 19169261.0,
"step": 1474
},
{
"epoch": 0.944,
"grad_norm": 3.1921486854553223,
"learning_rate": 5e-06,
"loss": 1.284,
"mean_token_accuracy": 0.6610319390892982,
"num_tokens": 19184566.0,
"step": 1475
},
{
"epoch": 0.94464,
"grad_norm": 3.192448139190674,
"learning_rate": 5e-06,
"loss": 1.2544,
"mean_token_accuracy": 0.670927107334137,
"num_tokens": 19199161.0,
"step": 1476
},
{
"epoch": 0.94528,
"grad_norm": 3.534567356109619,
"learning_rate": 5e-06,
"loss": 1.2898,
"mean_token_accuracy": 0.6620035171508789,
"num_tokens": 19210216.0,
"step": 1477
},
{
"epoch": 0.94592,
"grad_norm": 3.4070894718170166,
"learning_rate": 5e-06,
"loss": 1.2067,
"mean_token_accuracy": 0.665832906961441,
"num_tokens": 19222748.0,
"step": 1478
},
{
"epoch": 0.94656,
"grad_norm": 3.373779058456421,
"learning_rate": 5e-06,
"loss": 1.3304,
"mean_token_accuracy": 0.6508694216609001,
"num_tokens": 19236471.0,
"step": 1479
},
{
"epoch": 0.9472,
"grad_norm": 3.518333911895752,
"learning_rate": 5e-06,
"loss": 1.4454,
"mean_token_accuracy": 0.645517073571682,
"num_tokens": 19249438.0,
"step": 1480
},
{
"epoch": 0.94784,
"grad_norm": 3.995748519897461,
"learning_rate": 5e-06,
"loss": 1.4204,
"mean_token_accuracy": 0.6810062602162361,
"num_tokens": 19262043.0,
"step": 1481
},
{
"epoch": 0.94848,
"grad_norm": 3.0706183910369873,
"learning_rate": 5e-06,
"loss": 1.0148,
"mean_token_accuracy": 0.7076255902647972,
"num_tokens": 19277307.0,
"step": 1482
},
{
"epoch": 0.94912,
"grad_norm": 3.0978240966796875,
"learning_rate": 5e-06,
"loss": 1.3144,
"mean_token_accuracy": 0.6533934101462364,
"num_tokens": 19292657.0,
"step": 1483
},
{
"epoch": 0.94976,
"grad_norm": 3.988011121749878,
"learning_rate": 5e-06,
"loss": 1.3691,
"mean_token_accuracy": 0.6342190653085709,
"num_tokens": 19303123.0,
"step": 1484
},
{
"epoch": 0.9504,
"grad_norm": 3.7990894317626953,
"learning_rate": 5e-06,
"loss": 1.107,
"mean_token_accuracy": 0.7004605457186699,
"num_tokens": 19314275.0,
"step": 1485
},
{
"epoch": 0.95104,
"grad_norm": 3.5531113147735596,
"learning_rate": 5e-06,
"loss": 1.3478,
"mean_token_accuracy": 0.6372592151165009,
"num_tokens": 19327717.0,
"step": 1486
},
{
"epoch": 0.95168,
"grad_norm": 3.129286050796509,
"learning_rate": 5e-06,
"loss": 1.5809,
"mean_token_accuracy": 0.6213468164205551,
"num_tokens": 19341237.0,
"step": 1487
},
{
"epoch": 0.95232,
"grad_norm": 3.394064426422119,
"learning_rate": 5e-06,
"loss": 1.3591,
"mean_token_accuracy": 0.6372789964079857,
"num_tokens": 19355577.0,
"step": 1488
},
{
"epoch": 0.95296,
"grad_norm": 3.2110018730163574,
"learning_rate": 5e-06,
"loss": 1.2399,
"mean_token_accuracy": 0.679095022380352,
"num_tokens": 19371326.0,
"step": 1489
},
{
"epoch": 0.9536,
"grad_norm": 3.3202333450317383,
"learning_rate": 5e-06,
"loss": 1.3916,
"mean_token_accuracy": 0.6611816883087158,
"num_tokens": 19385868.0,
"step": 1490
},
{
"epoch": 0.95424,
"grad_norm": 3.5390098094940186,
"learning_rate": 5e-06,
"loss": 1.2219,
"mean_token_accuracy": 0.6761639937758446,
"num_tokens": 19398025.0,
"step": 1491
},
{
"epoch": 0.95488,
"grad_norm": 3.390742778778076,
"learning_rate": 5e-06,
"loss": 1.5499,
"mean_token_accuracy": 0.6107296124100685,
"num_tokens": 19412343.0,
"step": 1492
},
{
"epoch": 0.95552,
"grad_norm": 2.821200132369995,
"learning_rate": 5e-06,
"loss": 1.2155,
"mean_token_accuracy": 0.6577084437012672,
"num_tokens": 19428748.0,
"step": 1493
},
{
"epoch": 0.95616,
"grad_norm": 3.292036771774292,
"learning_rate": 5e-06,
"loss": 1.3155,
"mean_token_accuracy": 0.644202746450901,
"num_tokens": 19440656.0,
"step": 1494
},
{
"epoch": 0.9568,
"grad_norm": 3.416463851928711,
"learning_rate": 5e-06,
"loss": 1.2269,
"mean_token_accuracy": 0.6907675266265869,
"num_tokens": 19452544.0,
"step": 1495
},
{
"epoch": 0.95744,
"grad_norm": 3.6329751014709473,
"learning_rate": 5e-06,
"loss": 1.3323,
"mean_token_accuracy": 0.6382646858692169,
"num_tokens": 19465315.0,
"step": 1496
},
{
"epoch": 0.95808,
"grad_norm": 3.5367205142974854,
"learning_rate": 5e-06,
"loss": 1.373,
"mean_token_accuracy": 0.6586090922355652,
"num_tokens": 19480115.0,
"step": 1497
},
{
"epoch": 0.95872,
"grad_norm": 3.5177509784698486,
"learning_rate": 5e-06,
"loss": 1.2388,
"mean_token_accuracy": 0.6645878851413727,
"num_tokens": 19494388.0,
"step": 1498
},
{
"epoch": 0.95936,
"grad_norm": 3.709169626235962,
"learning_rate": 5e-06,
"loss": 1.3733,
"mean_token_accuracy": 0.6607565060257912,
"num_tokens": 19505621.0,
"step": 1499
},
{
"epoch": 0.96,
"grad_norm": 3.3196604251861572,
"learning_rate": 5e-06,
"loss": 1.1325,
"mean_token_accuracy": 0.6826166063547134,
"num_tokens": 19519830.0,
"step": 1500
},
{
"epoch": 0.96064,
"grad_norm": 4.17763090133667,
"learning_rate": 5e-06,
"loss": 1.2355,
"mean_token_accuracy": 0.6763554587960243,
"num_tokens": 19532118.0,
"step": 1501
},
{
"epoch": 0.96128,
"grad_norm": 3.9797887802124023,
"learning_rate": 5e-06,
"loss": 0.9252,
"mean_token_accuracy": 0.7308862134814262,
"num_tokens": 19543422.0,
"step": 1502
},
{
"epoch": 0.96192,
"grad_norm": 3.3593435287475586,
"learning_rate": 5e-06,
"loss": 1.121,
"mean_token_accuracy": 0.6892295926809311,
"num_tokens": 19555897.0,
"step": 1503
},
{
"epoch": 0.96256,
"grad_norm": 3.6559438705444336,
"learning_rate": 5e-06,
"loss": 1.1375,
"mean_token_accuracy": 0.6769029051065445,
"num_tokens": 19567248.0,
"step": 1504
},
{
"epoch": 0.9632,
"grad_norm": 3.6883292198181152,
"learning_rate": 5e-06,
"loss": 1.3164,
"mean_token_accuracy": 0.643324077129364,
"num_tokens": 19579310.0,
"step": 1505
},
{
"epoch": 0.96384,
"grad_norm": 3.5200116634368896,
"learning_rate": 5e-06,
"loss": 1.2694,
"mean_token_accuracy": 0.6747664734721184,
"num_tokens": 19592537.0,
"step": 1506
},
{
"epoch": 0.96448,
"grad_norm": 3.3167619705200195,
"learning_rate": 5e-06,
"loss": 1.2958,
"mean_token_accuracy": 0.6770147830247879,
"num_tokens": 19606932.0,
"step": 1507
},
{
"epoch": 0.96512,
"grad_norm": 2.7224249839782715,
"learning_rate": 5e-06,
"loss": 1.3125,
"mean_token_accuracy": 0.6614532843232155,
"num_tokens": 19624296.0,
"step": 1508
},
{
"epoch": 0.96576,
"grad_norm": 3.4137089252471924,
"learning_rate": 5e-06,
"loss": 1.2778,
"mean_token_accuracy": 0.662353903055191,
"num_tokens": 19637049.0,
"step": 1509
},
{
"epoch": 0.9664,
"grad_norm": 3.7370848655700684,
"learning_rate": 5e-06,
"loss": 1.4503,
"mean_token_accuracy": 0.639873132109642,
"num_tokens": 19649788.0,
"step": 1510
},
{
"epoch": 0.96704,
"grad_norm": 3.4333293437957764,
"learning_rate": 5e-06,
"loss": 1.4996,
"mean_token_accuracy": 0.63913669064641,
"num_tokens": 19662956.0,
"step": 1511
},
{
"epoch": 0.96768,
"grad_norm": 3.8436150550842285,
"learning_rate": 5e-06,
"loss": 1.2372,
"mean_token_accuracy": 0.6666671261191368,
"num_tokens": 19674701.0,
"step": 1512
},
{
"epoch": 0.96832,
"grad_norm": 3.4364569187164307,
"learning_rate": 5e-06,
"loss": 1.4256,
"mean_token_accuracy": 0.6375450566411018,
"num_tokens": 19688356.0,
"step": 1513
},
{
"epoch": 0.96896,
"grad_norm": 3.1849286556243896,
"learning_rate": 5e-06,
"loss": 1.3019,
"mean_token_accuracy": 0.654203861951828,
"num_tokens": 19703055.0,
"step": 1514
},
{
"epoch": 0.9696,
"grad_norm": 3.790954828262329,
"learning_rate": 5e-06,
"loss": 1.1957,
"mean_token_accuracy": 0.6805417165160179,
"num_tokens": 19715360.0,
"step": 1515
},
{
"epoch": 0.97024,
"grad_norm": 3.696563243865967,
"learning_rate": 5e-06,
"loss": 1.2499,
"mean_token_accuracy": 0.6584246829152107,
"num_tokens": 19726044.0,
"step": 1516
},
{
"epoch": 0.97088,
"grad_norm": 4.10850191116333,
"learning_rate": 5e-06,
"loss": 1.4378,
"mean_token_accuracy": 0.6355271711945534,
"num_tokens": 19739139.0,
"step": 1517
},
{
"epoch": 0.97152,
"grad_norm": 3.1323556900024414,
"learning_rate": 5e-06,
"loss": 1.3652,
"mean_token_accuracy": 0.6413158774375916,
"num_tokens": 19753058.0,
"step": 1518
},
{
"epoch": 0.97216,
"grad_norm": 3.334622859954834,
"learning_rate": 5e-06,
"loss": 1.2963,
"mean_token_accuracy": 0.6517771631479263,
"num_tokens": 19765569.0,
"step": 1519
},
{
"epoch": 0.9728,
"grad_norm": 5.364054203033447,
"learning_rate": 5e-06,
"loss": 1.1438,
"mean_token_accuracy": 0.6952068582177162,
"num_tokens": 19778058.0,
"step": 1520
},
{
"epoch": 0.97344,
"grad_norm": 3.416874408721924,
"learning_rate": 5e-06,
"loss": 1.1759,
"mean_token_accuracy": 0.6735500246286392,
"num_tokens": 19792728.0,
"step": 1521
},
{
"epoch": 0.97408,
"grad_norm": 3.164233922958374,
"learning_rate": 5e-06,
"loss": 1.1211,
"mean_token_accuracy": 0.6952219977974892,
"num_tokens": 19807085.0,
"step": 1522
},
{
"epoch": 0.97472,
"grad_norm": 3.73028564453125,
"learning_rate": 5e-06,
"loss": 1.3345,
"mean_token_accuracy": 0.6841987073421478,
"num_tokens": 19821681.0,
"step": 1523
},
{
"epoch": 0.97536,
"grad_norm": 3.401895761489868,
"learning_rate": 5e-06,
"loss": 1.3681,
"mean_token_accuracy": 0.6333037242293358,
"num_tokens": 19834796.0,
"step": 1524
},
{
"epoch": 0.976,
"grad_norm": 3.8067119121551514,
"learning_rate": 5e-06,
"loss": 1.0905,
"mean_token_accuracy": 0.6978934183716774,
"num_tokens": 19846639.0,
"step": 1525
},
{
"epoch": 0.97664,
"grad_norm": 3.070439338684082,
"learning_rate": 5e-06,
"loss": 1.2461,
"mean_token_accuracy": 0.653811477124691,
"num_tokens": 19860465.0,
"step": 1526
},
{
"epoch": 0.97728,
"grad_norm": 3.186588764190674,
"learning_rate": 5e-06,
"loss": 1.1821,
"mean_token_accuracy": 0.7026697173714638,
"num_tokens": 19876272.0,
"step": 1527
},
{
"epoch": 0.97792,
"grad_norm": 3.122529983520508,
"learning_rate": 5e-06,
"loss": 1.1799,
"mean_token_accuracy": 0.6770785599946976,
"num_tokens": 19892221.0,
"step": 1528
},
{
"epoch": 0.97856,
"grad_norm": 3.7920093536376953,
"learning_rate": 5e-06,
"loss": 1.3852,
"mean_token_accuracy": 0.6517146974802017,
"num_tokens": 19903219.0,
"step": 1529
},
{
"epoch": 0.9792,
"grad_norm": 3.9800093173980713,
"learning_rate": 5e-06,
"loss": 1.3666,
"mean_token_accuracy": 0.6671213582158089,
"num_tokens": 19914283.0,
"step": 1530
},
{
"epoch": 0.97984,
"grad_norm": 4.115480899810791,
"learning_rate": 5e-06,
"loss": 1.6462,
"mean_token_accuracy": 0.6288831681013107,
"num_tokens": 19924831.0,
"step": 1531
},
{
"epoch": 0.98048,
"grad_norm": 3.8407366275787354,
"learning_rate": 5e-06,
"loss": 1.3123,
"mean_token_accuracy": 0.646103672683239,
"num_tokens": 19935368.0,
"step": 1532
},
{
"epoch": 0.98112,
"grad_norm": 3.036931276321411,
"learning_rate": 5e-06,
"loss": 1.3947,
"mean_token_accuracy": 0.6490734815597534,
"num_tokens": 19950888.0,
"step": 1533
},
{
"epoch": 0.98176,
"grad_norm": 3.3416826725006104,
"learning_rate": 5e-06,
"loss": 1.3709,
"mean_token_accuracy": 0.6444736868143082,
"num_tokens": 19964717.0,
"step": 1534
},
{
"epoch": 0.9824,
"grad_norm": 3.184088945388794,
"learning_rate": 5e-06,
"loss": 1.3429,
"mean_token_accuracy": 0.673894077539444,
"num_tokens": 19977976.0,
"step": 1535
},
{
"epoch": 0.98304,
"grad_norm": 3.382946491241455,
"learning_rate": 5e-06,
"loss": 1.481,
"mean_token_accuracy": 0.6425464749336243,
"num_tokens": 19991312.0,
"step": 1536
},
{
"epoch": 0.98368,
"grad_norm": 3.7429699897766113,
"learning_rate": 5e-06,
"loss": 1.2422,
"mean_token_accuracy": 0.6737086698412895,
"num_tokens": 20002300.0,
"step": 1537
},
{
"epoch": 0.98432,
"grad_norm": 3.6931872367858887,
"learning_rate": 5e-06,
"loss": 1.3122,
"mean_token_accuracy": 0.6581440344452858,
"num_tokens": 20015107.0,
"step": 1538
},
{
"epoch": 0.98496,
"grad_norm": 4.0337300300598145,
"learning_rate": 5e-06,
"loss": 1.3912,
"mean_token_accuracy": 0.6898427382111549,
"num_tokens": 20027265.0,
"step": 1539
},
{
"epoch": 0.9856,
"grad_norm": 3.514187812805176,
"learning_rate": 5e-06,
"loss": 1.0613,
"mean_token_accuracy": 0.7012772336602211,
"num_tokens": 20038919.0,
"step": 1540
},
{
"epoch": 0.98624,
"grad_norm": 3.5034477710723877,
"learning_rate": 5e-06,
"loss": 1.4009,
"mean_token_accuracy": 0.6428939253091812,
"num_tokens": 20052482.0,
"step": 1541
},
{
"epoch": 0.98688,
"grad_norm": 3.3519279956817627,
"learning_rate": 5e-06,
"loss": 1.4362,
"mean_token_accuracy": 0.6396335512399673,
"num_tokens": 20067032.0,
"step": 1542
},
{
"epoch": 0.98752,
"grad_norm": 3.7068188190460205,
"learning_rate": 5e-06,
"loss": 1.2301,
"mean_token_accuracy": 0.6891591548919678,
"num_tokens": 20079146.0,
"step": 1543
},
{
"epoch": 0.98816,
"grad_norm": 3.6617250442504883,
"learning_rate": 5e-06,
"loss": 1.211,
"mean_token_accuracy": 0.7132939025759697,
"num_tokens": 20089620.0,
"step": 1544
},
{
"epoch": 0.9888,
"grad_norm": 3.217038631439209,
"learning_rate": 5e-06,
"loss": 1.3661,
"mean_token_accuracy": 0.6576998308300972,
"num_tokens": 20103587.0,
"step": 1545
},
{
"epoch": 0.98944,
"grad_norm": 3.996293783187866,
"learning_rate": 5e-06,
"loss": 1.2923,
"mean_token_accuracy": 0.6637570187449455,
"num_tokens": 20115402.0,
"step": 1546
},
{
"epoch": 0.99008,
"grad_norm": 3.543278932571411,
"learning_rate": 5e-06,
"loss": 1.2429,
"mean_token_accuracy": 0.6742196753621101,
"num_tokens": 20126222.0,
"step": 1547
},
{
"epoch": 0.99072,
"grad_norm": 3.501190662384033,
"learning_rate": 5e-06,
"loss": 1.2304,
"mean_token_accuracy": 0.6541951596736908,
"num_tokens": 20137476.0,
"step": 1548
},
{
"epoch": 0.99136,
"grad_norm": 3.904467821121216,
"learning_rate": 5e-06,
"loss": 1.2723,
"mean_token_accuracy": 0.6750770211219788,
"num_tokens": 20149377.0,
"step": 1549
},
{
"epoch": 0.992,
"grad_norm": 3.557426691055298,
"learning_rate": 5e-06,
"loss": 1.4754,
"mean_token_accuracy": 0.6633486226201057,
"num_tokens": 20161955.0,
"step": 1550
},
{
"epoch": 0.99264,
"grad_norm": 3.5321543216705322,
"learning_rate": 5e-06,
"loss": 1.3909,
"mean_token_accuracy": 0.6640786305069923,
"num_tokens": 20174432.0,
"step": 1551
},
{
"epoch": 0.99328,
"grad_norm": 4.1432929039001465,
"learning_rate": 5e-06,
"loss": 1.2162,
"mean_token_accuracy": 0.6733951196074486,
"num_tokens": 20186656.0,
"step": 1552
},
{
"epoch": 0.99392,
"grad_norm": 3.221876859664917,
"learning_rate": 5e-06,
"loss": 1.2039,
"mean_token_accuracy": 0.6700675636529922,
"num_tokens": 20200325.0,
"step": 1553
},
{
"epoch": 0.99456,
"grad_norm": 3.4923529624938965,
"learning_rate": 5e-06,
"loss": 1.2479,
"mean_token_accuracy": 0.6704057157039642,
"num_tokens": 20211958.0,
"step": 1554
},
{
"epoch": 0.9952,
"grad_norm": 3.4751315116882324,
"learning_rate": 5e-06,
"loss": 1.2513,
"mean_token_accuracy": 0.6954710930585861,
"num_tokens": 20224457.0,
"step": 1555
},
{
"epoch": 0.99584,
"grad_norm": 3.4763216972351074,
"learning_rate": 5e-06,
"loss": 1.1645,
"mean_token_accuracy": 0.6789154633879662,
"num_tokens": 20236259.0,
"step": 1556
},
{
"epoch": 0.99648,
"grad_norm": 3.582597017288208,
"learning_rate": 5e-06,
"loss": 1.383,
"mean_token_accuracy": 0.6580745279788971,
"num_tokens": 20250508.0,
"step": 1557
},
{
"epoch": 0.99712,
"grad_norm": 4.058999061584473,
"learning_rate": 5e-06,
"loss": 1.3162,
"mean_token_accuracy": 0.6591609418392181,
"num_tokens": 20262337.0,
"step": 1558
},
{
"epoch": 0.99776,
"grad_norm": 3.842996597290039,
"learning_rate": 5e-06,
"loss": 1.3768,
"mean_token_accuracy": 0.6542828008532524,
"num_tokens": 20273865.0,
"step": 1559
},
{
"epoch": 0.9984,
"grad_norm": 3.5340254306793213,
"learning_rate": 5e-06,
"loss": 1.2762,
"mean_token_accuracy": 0.6779467761516571,
"num_tokens": 20286723.0,
"step": 1560
},
{
"epoch": 0.99904,
"grad_norm": 3.087484836578369,
"learning_rate": 5e-06,
"loss": 1.3845,
"mean_token_accuracy": 0.6415645852684975,
"num_tokens": 20302849.0,
"step": 1561
},
{
"epoch": 0.99968,
"grad_norm": 3.4678475856781006,
"learning_rate": 5e-06,
"loss": 1.2984,
"mean_token_accuracy": 0.6632586568593979,
"num_tokens": 20315462.0,
"step": 1562
}
],
"logging_steps": 1,
"max_steps": 1562,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 56623305523200.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}