{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 613, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016313213703099511, "grad_norm": 54.38072967529297, "learning_rate": 1.6129032258064518e-07, "loss": 3.9722, "mean_token_accuracy": 0.314461886882782, "step": 1 }, { "epoch": 0.0032626427406199023, "grad_norm": 59.083343505859375, "learning_rate": 3.2258064516129035e-07, "loss": 3.7752, "mean_token_accuracy": 0.3500784933567047, "step": 2 }, { "epoch": 0.004893964110929853, "grad_norm": 52.31679153442383, "learning_rate": 4.838709677419355e-07, "loss": 3.9767, "mean_token_accuracy": 0.32198143005371094, "step": 3 }, { "epoch": 0.0065252854812398045, "grad_norm": 56.8325080871582, "learning_rate": 6.451612903225807e-07, "loss": 3.8677, "mean_token_accuracy": 0.34073251485824585, "step": 4 }, { "epoch": 0.008156606851549755, "grad_norm": 46.90914535522461, "learning_rate": 8.064516129032258e-07, "loss": 3.7833, "mean_token_accuracy": 0.3529976010322571, "step": 5 }, { "epoch": 0.009787928221859706, "grad_norm": 50.84980010986328, "learning_rate": 9.67741935483871e-07, "loss": 3.6046, "mean_token_accuracy": 0.36332181096076965, "step": 6 }, { "epoch": 0.011419249592169658, "grad_norm": 44.124671936035156, "learning_rate": 1.1290322580645162e-06, "loss": 3.5605, "mean_token_accuracy": 0.3784194588661194, "step": 7 }, { "epoch": 0.013050570962479609, "grad_norm": 38.687442779541016, "learning_rate": 1.2903225806451614e-06, "loss": 3.6567, "mean_token_accuracy": 0.36630603671073914, "step": 8 }, { "epoch": 0.01468189233278956, "grad_norm": 32.46002960205078, "learning_rate": 1.4516129032258066e-06, "loss": 3.7645, "mean_token_accuracy": 0.3374135196208954, "step": 9 }, { "epoch": 0.01631321370309951, "grad_norm": 29.601980209350586, "learning_rate": 1.6129032258064516e-06, "loss": 3.735, "mean_token_accuracy": 0.340471088886261, "step": 10 }, { "epoch": 0.01794453507340946, "grad_norm": 25.11663818359375, "learning_rate": 1.774193548387097e-06, "loss": 3.3774, "mean_token_accuracy": 0.3931350111961365, "step": 11 }, { "epoch": 0.01957585644371941, "grad_norm": 18.90343475341797, "learning_rate": 1.935483870967742e-06, "loss": 3.2297, "mean_token_accuracy": 0.41412118077278137, "step": 12 }, { "epoch": 0.021207177814029365, "grad_norm": 21.3724422454834, "learning_rate": 2.096774193548387e-06, "loss": 3.1907, "mean_token_accuracy": 0.43043479323387146, "step": 13 }, { "epoch": 0.022838499184339316, "grad_norm": 18.062108993530273, "learning_rate": 2.2580645161290324e-06, "loss": 3.0692, "mean_token_accuracy": 0.43661972880363464, "step": 14 }, { "epoch": 0.024469820554649267, "grad_norm": 18.955305099487305, "learning_rate": 2.4193548387096776e-06, "loss": 3.0939, "mean_token_accuracy": 0.4284232258796692, "step": 15 }, { "epoch": 0.026101141924959218, "grad_norm": 19.71297264099121, "learning_rate": 2.580645161290323e-06, "loss": 2.9745, "mean_token_accuracy": 0.45571428537368774, "step": 16 }, { "epoch": 0.02773246329526917, "grad_norm": 15.891701698303223, "learning_rate": 2.7419354838709676e-06, "loss": 2.843, "mean_token_accuracy": 0.4642857015132904, "step": 17 }, { "epoch": 0.02936378466557912, "grad_norm": 14.574506759643555, "learning_rate": 2.903225806451613e-06, "loss": 2.6097, "mean_token_accuracy": 0.4918205738067627, "step": 18 }, { "epoch": 0.03099510603588907, "grad_norm": 13.931673049926758, "learning_rate": 3.0645161290322584e-06, "loss": 2.5162, "mean_token_accuracy": 0.5131129026412964, "step": 19 }, { "epoch": 0.03262642740619902, "grad_norm": 13.101471900939941, "learning_rate": 3.225806451612903e-06, "loss": 2.7058, "mean_token_accuracy": 0.47575756907463074, "step": 20 }, { "epoch": 0.03425774877650897, "grad_norm": 12.979852676391602, "learning_rate": 3.3870967741935484e-06, "loss": 2.5696, "mean_token_accuracy": 0.4878854751586914, "step": 21 }, { "epoch": 0.03588907014681892, "grad_norm": 14.335384368896484, "learning_rate": 3.548387096774194e-06, "loss": 2.5031, "mean_token_accuracy": 0.48602256178855896, "step": 22 }, { "epoch": 0.037520391517128875, "grad_norm": 14.542072296142578, "learning_rate": 3.7096774193548392e-06, "loss": 2.4432, "mean_token_accuracy": 0.5048364996910095, "step": 23 }, { "epoch": 0.03915171288743882, "grad_norm": 12.069889068603516, "learning_rate": 3.870967741935484e-06, "loss": 2.2043, "mean_token_accuracy": 0.5519013404846191, "step": 24 }, { "epoch": 0.040783034257748776, "grad_norm": 9.698949813842773, "learning_rate": 4.032258064516129e-06, "loss": 2.0031, "mean_token_accuracy": 0.5868473649024963, "step": 25 }, { "epoch": 0.04241435562805873, "grad_norm": 10.89166259765625, "learning_rate": 4.193548387096774e-06, "loss": 2.3342, "mean_token_accuracy": 0.5324609875679016, "step": 26 }, { "epoch": 0.04404567699836868, "grad_norm": 9.197402000427246, "learning_rate": 4.35483870967742e-06, "loss": 2.2205, "mean_token_accuracy": 0.5475698113441467, "step": 27 }, { "epoch": 0.04567699836867863, "grad_norm": 9.47153377532959, "learning_rate": 4.516129032258065e-06, "loss": 2.0431, "mean_token_accuracy": 0.5686706900596619, "step": 28 }, { "epoch": 0.04730831973898858, "grad_norm": 8.886749267578125, "learning_rate": 4.67741935483871e-06, "loss": 2.1793, "mean_token_accuracy": 0.5322735905647278, "step": 29 }, { "epoch": 0.048939641109298535, "grad_norm": 10.089822769165039, "learning_rate": 4.838709677419355e-06, "loss": 1.987, "mean_token_accuracy": 0.5579903721809387, "step": 30 }, { "epoch": 0.05057096247960848, "grad_norm": 11.309324264526367, "learning_rate": 5e-06, "loss": 2.0749, "mean_token_accuracy": 0.5649139285087585, "step": 31 }, { "epoch": 0.052202283849918436, "grad_norm": 9.036641120910645, "learning_rate": 5.161290322580646e-06, "loss": 2.1629, "mean_token_accuracy": 0.5413948893547058, "step": 32 }, { "epoch": 0.053833605220228384, "grad_norm": 8.936366081237793, "learning_rate": 5.322580645161291e-06, "loss": 1.9053, "mean_token_accuracy": 0.5875675678253174, "step": 33 }, { "epoch": 0.05546492659053834, "grad_norm": 8.523772239685059, "learning_rate": 5.483870967741935e-06, "loss": 1.962, "mean_token_accuracy": 0.5871559381484985, "step": 34 }, { "epoch": 0.057096247960848286, "grad_norm": 8.703071594238281, "learning_rate": 5.645161290322582e-06, "loss": 2.0717, "mean_token_accuracy": 0.5543113350868225, "step": 35 }, { "epoch": 0.05872756933115824, "grad_norm": 8.243901252746582, "learning_rate": 5.806451612903226e-06, "loss": 1.9278, "mean_token_accuracy": 0.5818815231323242, "step": 36 }, { "epoch": 0.06035889070146819, "grad_norm": 8.658400535583496, "learning_rate": 5.967741935483872e-06, "loss": 1.9476, "mean_token_accuracy": 0.5738636255264282, "step": 37 }, { "epoch": 0.06199021207177814, "grad_norm": 8.671000480651855, "learning_rate": 6.129032258064517e-06, "loss": 2.0554, "mean_token_accuracy": 0.5679658055305481, "step": 38 }, { "epoch": 0.0636215334420881, "grad_norm": 9.466026306152344, "learning_rate": 6.290322580645162e-06, "loss": 1.9489, "mean_token_accuracy": 0.5673534274101257, "step": 39 }, { "epoch": 0.06525285481239804, "grad_norm": 8.415104866027832, "learning_rate": 6.451612903225806e-06, "loss": 2.1262, "mean_token_accuracy": 0.5633999109268188, "step": 40 }, { "epoch": 0.06688417618270799, "grad_norm": 7.783365726470947, "learning_rate": 6.612903225806452e-06, "loss": 1.7869, "mean_token_accuracy": 0.6028110384941101, "step": 41 }, { "epoch": 0.06851549755301795, "grad_norm": 8.495488166809082, "learning_rate": 6.774193548387097e-06, "loss": 1.7062, "mean_token_accuracy": 0.6242873668670654, "step": 42 }, { "epoch": 0.0701468189233279, "grad_norm": 8.216286659240723, "learning_rate": 6.935483870967743e-06, "loss": 1.8002, "mean_token_accuracy": 0.6164383292198181, "step": 43 }, { "epoch": 0.07177814029363784, "grad_norm": 7.681854724884033, "learning_rate": 7.096774193548388e-06, "loss": 1.8663, "mean_token_accuracy": 0.5869767665863037, "step": 44 }, { "epoch": 0.0734094616639478, "grad_norm": 7.960548400878906, "learning_rate": 7.258064516129033e-06, "loss": 1.5801, "mean_token_accuracy": 0.6295210123062134, "step": 45 }, { "epoch": 0.07504078303425775, "grad_norm": 8.843791007995605, "learning_rate": 7.4193548387096784e-06, "loss": 1.9695, "mean_token_accuracy": 0.5767748951911926, "step": 46 }, { "epoch": 0.0766721044045677, "grad_norm": 7.562375068664551, "learning_rate": 7.580645161290323e-06, "loss": 1.8982, "mean_token_accuracy": 0.5856515169143677, "step": 47 }, { "epoch": 0.07830342577487764, "grad_norm": 7.976773738861084, "learning_rate": 7.741935483870968e-06, "loss": 1.8455, "mean_token_accuracy": 0.5857519507408142, "step": 48 }, { "epoch": 0.0799347471451876, "grad_norm": 7.795076847076416, "learning_rate": 7.903225806451613e-06, "loss": 1.738, "mean_token_accuracy": 0.602642297744751, "step": 49 }, { "epoch": 0.08156606851549755, "grad_norm": 9.113154411315918, "learning_rate": 8.064516129032258e-06, "loss": 1.7887, "mean_token_accuracy": 0.6150583028793335, "step": 50 }, { "epoch": 0.08319738988580751, "grad_norm": 9.503119468688965, "learning_rate": 8.225806451612904e-06, "loss": 1.6738, "mean_token_accuracy": 0.6308540105819702, "step": 51 }, { "epoch": 0.08482871125611746, "grad_norm": 7.7233757972717285, "learning_rate": 8.387096774193549e-06, "loss": 1.8524, "mean_token_accuracy": 0.6068170666694641, "step": 52 }, { "epoch": 0.0864600326264274, "grad_norm": 8.368830680847168, "learning_rate": 8.548387096774194e-06, "loss": 1.6863, "mean_token_accuracy": 0.641238272190094, "step": 53 }, { "epoch": 0.08809135399673736, "grad_norm": 8.289685249328613, "learning_rate": 8.70967741935484e-06, "loss": 1.7527, "mean_token_accuracy": 0.6219838857650757, "step": 54 }, { "epoch": 0.08972267536704731, "grad_norm": 8.580499649047852, "learning_rate": 8.870967741935484e-06, "loss": 1.7605, "mean_token_accuracy": 0.622188150882721, "step": 55 }, { "epoch": 0.09135399673735727, "grad_norm": 8.407153129577637, "learning_rate": 9.03225806451613e-06, "loss": 1.9015, "mean_token_accuracy": 0.6121242046356201, "step": 56 }, { "epoch": 0.0929853181076672, "grad_norm": 7.347232818603516, "learning_rate": 9.193548387096775e-06, "loss": 1.6066, "mean_token_accuracy": 0.6575052738189697, "step": 57 }, { "epoch": 0.09461663947797716, "grad_norm": 7.600398063659668, "learning_rate": 9.35483870967742e-06, "loss": 1.6309, "mean_token_accuracy": 0.6496000289916992, "step": 58 }, { "epoch": 0.09624796084828711, "grad_norm": 9.03729248046875, "learning_rate": 9.516129032258065e-06, "loss": 1.5208, "mean_token_accuracy": 0.6523297429084778, "step": 59 }, { "epoch": 0.09787928221859707, "grad_norm": 7.88900899887085, "learning_rate": 9.67741935483871e-06, "loss": 1.5696, "mean_token_accuracy": 0.6507083773612976, "step": 60 }, { "epoch": 0.09951060358890701, "grad_norm": 7.398552417755127, "learning_rate": 9.838709677419356e-06, "loss": 1.4991, "mean_token_accuracy": 0.6561679840087891, "step": 61 }, { "epoch": 0.10114192495921696, "grad_norm": 7.690386772155762, "learning_rate": 1e-05, "loss": 1.4677, "mean_token_accuracy": 0.6609534025192261, "step": 62 }, { "epoch": 0.10277324632952692, "grad_norm": 7.935258865356445, "learning_rate": 9.999926856137682e-06, "loss": 1.5293, "mean_token_accuracy": 0.6509740352630615, "step": 63 }, { "epoch": 0.10440456769983687, "grad_norm": 7.435649871826172, "learning_rate": 9.999707426928513e-06, "loss": 1.5408, "mean_token_accuracy": 0.6423665881156921, "step": 64 }, { "epoch": 0.10603588907014681, "grad_norm": 7.0717668533325195, "learning_rate": 9.999341719505784e-06, "loss": 1.2598, "mean_token_accuracy": 0.7105831503868103, "step": 65 }, { "epoch": 0.10766721044045677, "grad_norm": 7.5760722160339355, "learning_rate": 9.998829745758052e-06, "loss": 1.5635, "mean_token_accuracy": 0.6381751298904419, "step": 66 }, { "epoch": 0.10929853181076672, "grad_norm": 7.556014060974121, "learning_rate": 9.998171522328753e-06, "loss": 1.6741, "mean_token_accuracy": 0.6098901033401489, "step": 67 }, { "epoch": 0.11092985318107668, "grad_norm": 7.316895008087158, "learning_rate": 9.99736707061567e-06, "loss": 1.698, "mean_token_accuracy": 0.6228723526000977, "step": 68 }, { "epoch": 0.11256117455138662, "grad_norm": 8.193136215209961, "learning_rate": 9.996416416770227e-06, "loss": 1.6473, "mean_token_accuracy": 0.6394094228744507, "step": 69 }, { "epoch": 0.11419249592169657, "grad_norm": 6.792864799499512, "learning_rate": 9.995319591696643e-06, "loss": 1.6064, "mean_token_accuracy": 0.6287455558776855, "step": 70 }, { "epoch": 0.11582381729200653, "grad_norm": 7.596305847167969, "learning_rate": 9.994076631050926e-06, "loss": 1.8675, "mean_token_accuracy": 0.5812404155731201, "step": 71 }, { "epoch": 0.11745513866231648, "grad_norm": 6.764160633087158, "learning_rate": 9.99268757523972e-06, "loss": 1.5861, "mean_token_accuracy": 0.64697265625, "step": 72 }, { "epoch": 0.11908646003262642, "grad_norm": 7.583809852600098, "learning_rate": 9.991152469418984e-06, "loss": 1.3654, "mean_token_accuracy": 0.6922652125358582, "step": 73 }, { "epoch": 0.12071778140293637, "grad_norm": 7.365781307220459, "learning_rate": 9.989471363492523e-06, "loss": 1.6449, "mean_token_accuracy": 0.6340000033378601, "step": 74 }, { "epoch": 0.12234910277324633, "grad_norm": 7.349303722381592, "learning_rate": 9.987644312110373e-06, "loss": 1.7496, "mean_token_accuracy": 0.6141689419746399, "step": 75 }, { "epoch": 0.12398042414355628, "grad_norm": 6.4074273109436035, "learning_rate": 9.985671374667024e-06, "loss": 1.5874, "mean_token_accuracy": 0.6464434862136841, "step": 76 }, { "epoch": 0.12561174551386622, "grad_norm": 6.483602046966553, "learning_rate": 9.98355261529948e-06, "loss": 1.6916, "mean_token_accuracy": 0.6172904372215271, "step": 77 }, { "epoch": 0.1272430668841762, "grad_norm": 6.887275695800781, "learning_rate": 9.981288102885185e-06, "loss": 1.6873, "mean_token_accuracy": 0.6121962666511536, "step": 78 }, { "epoch": 0.12887438825448613, "grad_norm": 6.4050703048706055, "learning_rate": 9.978877911039772e-06, "loss": 1.4187, "mean_token_accuracy": 0.6751824617385864, "step": 79 }, { "epoch": 0.13050570962479607, "grad_norm": 6.44724178314209, "learning_rate": 9.976322118114685e-06, "loss": 1.4161, "mean_token_accuracy": 0.6592556834220886, "step": 80 }, { "epoch": 0.13213703099510604, "grad_norm": 5.995436668395996, "learning_rate": 9.97362080719462e-06, "loss": 1.3907, "mean_token_accuracy": 0.6656084656715393, "step": 81 }, { "epoch": 0.13376835236541598, "grad_norm": 6.501825332641602, "learning_rate": 9.970774066094825e-06, "loss": 1.6026, "mean_token_accuracy": 0.603732168674469, "step": 82 }, { "epoch": 0.13539967373572595, "grad_norm": 7.173989772796631, "learning_rate": 9.967781987358252e-06, "loss": 1.7378, "mean_token_accuracy": 0.6143959164619446, "step": 83 }, { "epoch": 0.1370309951060359, "grad_norm": 6.576292991638184, "learning_rate": 9.964644668252544e-06, "loss": 1.4204, "mean_token_accuracy": 0.6584976315498352, "step": 84 }, { "epoch": 0.13866231647634583, "grad_norm": 8.727774620056152, "learning_rate": 9.961362210766871e-06, "loss": 1.6993, "mean_token_accuracy": 0.6126176118850708, "step": 85 }, { "epoch": 0.1402936378466558, "grad_norm": 6.580403804779053, "learning_rate": 9.957934721608621e-06, "loss": 1.6845, "mean_token_accuracy": 0.6215676665306091, "step": 86 }, { "epoch": 0.14192495921696574, "grad_norm": 5.9920830726623535, "learning_rate": 9.954362312199926e-06, "loss": 1.3893, "mean_token_accuracy": 0.6767676472663879, "step": 87 }, { "epoch": 0.14355628058727568, "grad_norm": 5.893803119659424, "learning_rate": 9.950645098674037e-06, "loss": 1.4447, "mean_token_accuracy": 0.6626806259155273, "step": 88 }, { "epoch": 0.14518760195758565, "grad_norm": 6.5982770919799805, "learning_rate": 9.946783201871558e-06, "loss": 1.3436, "mean_token_accuracy": 0.6762666702270508, "step": 89 }, { "epoch": 0.1468189233278956, "grad_norm": 5.981234550476074, "learning_rate": 9.942776747336509e-06, "loss": 1.5784, "mean_token_accuracy": 0.6174784898757935, "step": 90 }, { "epoch": 0.14845024469820556, "grad_norm": 6.088432788848877, "learning_rate": 9.938625865312252e-06, "loss": 1.7807, "mean_token_accuracy": 0.5808597803115845, "step": 91 }, { "epoch": 0.1500815660685155, "grad_norm": 6.743659973144531, "learning_rate": 9.934330690737247e-06, "loss": 1.6376, "mean_token_accuracy": 0.604613721370697, "step": 92 }, { "epoch": 0.15171288743882544, "grad_norm": 5.764866828918457, "learning_rate": 9.929891363240679e-06, "loss": 1.6292, "mean_token_accuracy": 0.6264821887016296, "step": 93 }, { "epoch": 0.1533442088091354, "grad_norm": 5.750985622406006, "learning_rate": 9.925308027137906e-06, "loss": 1.3667, "mean_token_accuracy": 0.6758104562759399, "step": 94 }, { "epoch": 0.15497553017944535, "grad_norm": 5.635873317718506, "learning_rate": 9.920580831425774e-06, "loss": 1.442, "mean_token_accuracy": 0.6777954697608948, "step": 95 }, { "epoch": 0.1566068515497553, "grad_norm": 5.207980632781982, "learning_rate": 9.915709929777773e-06, "loss": 1.1315, "mean_token_accuracy": 0.7171201705932617, "step": 96 }, { "epoch": 0.15823817292006526, "grad_norm": 6.929599761962891, "learning_rate": 9.910695480539043e-06, "loss": 1.5498, "mean_token_accuracy": 0.6462904810905457, "step": 97 }, { "epoch": 0.1598694942903752, "grad_norm": 6.597740173339844, "learning_rate": 9.905537646721215e-06, "loss": 1.3707, "mean_token_accuracy": 0.6714513301849365, "step": 98 }, { "epoch": 0.16150081566068517, "grad_norm": 5.562872409820557, "learning_rate": 9.900236595997138e-06, "loss": 1.2183, "mean_token_accuracy": 0.709775984287262, "step": 99 }, { "epoch": 0.1631321370309951, "grad_norm": 5.840291976928711, "learning_rate": 9.89479250069539e-06, "loss": 1.2124, "mean_token_accuracy": 0.7145169973373413, "step": 100 }, { "epoch": 0.16476345840130505, "grad_norm": 5.99063777923584, "learning_rate": 9.889205537794715e-06, "loss": 1.3492, "mean_token_accuracy": 0.6756311655044556, "step": 101 }, { "epoch": 0.16639477977161501, "grad_norm": 6.224008560180664, "learning_rate": 9.883475888918241e-06, "loss": 1.2016, "mean_token_accuracy": 0.7054827809333801, "step": 102 }, { "epoch": 0.16802610114192496, "grad_norm": 5.562602519989014, "learning_rate": 9.87760374032759e-06, "loss": 1.5352, "mean_token_accuracy": 0.6521076560020447, "step": 103 }, { "epoch": 0.16965742251223492, "grad_norm": 5.726022243499756, "learning_rate": 9.87158928291682e-06, "loss": 1.3858, "mean_token_accuracy": 0.6717791557312012, "step": 104 }, { "epoch": 0.17128874388254486, "grad_norm": 6.054457664489746, "learning_rate": 9.865432712206215e-06, "loss": 1.6255, "mean_token_accuracy": 0.6333163976669312, "step": 105 }, { "epoch": 0.1729200652528548, "grad_norm": 5.757321357727051, "learning_rate": 9.859134228335937e-06, "loss": 1.3847, "mean_token_accuracy": 0.6641345620155334, "step": 106 }, { "epoch": 0.17455138662316477, "grad_norm": 5.4531450271606445, "learning_rate": 9.852694036059514e-06, "loss": 1.4778, "mean_token_accuracy": 0.680861234664917, "step": 107 }, { "epoch": 0.1761827079934747, "grad_norm": 6.217274188995361, "learning_rate": 9.846112344737182e-06, "loss": 1.3624, "mean_token_accuracy": 0.6645264625549316, "step": 108 }, { "epoch": 0.17781402936378465, "grad_norm": 5.447512626647949, "learning_rate": 9.839389368329088e-06, "loss": 1.5179, "mean_token_accuracy": 0.6528394818305969, "step": 109 }, { "epoch": 0.17944535073409462, "grad_norm": 6.115851402282715, "learning_rate": 9.832525325388326e-06, "loss": 1.6997, "mean_token_accuracy": 0.6170212626457214, "step": 110 }, { "epoch": 0.18107667210440456, "grad_norm": 5.800912857055664, "learning_rate": 9.825520439053832e-06, "loss": 1.4313, "mean_token_accuracy": 0.6626384854316711, "step": 111 }, { "epoch": 0.18270799347471453, "grad_norm": 6.369785785675049, "learning_rate": 9.818374937043138e-06, "loss": 1.5534, "mean_token_accuracy": 0.6290909051895142, "step": 112 }, { "epoch": 0.18433931484502447, "grad_norm": 6.613420009613037, "learning_rate": 9.811089051644959e-06, "loss": 1.6318, "mean_token_accuracy": 0.6186726689338684, "step": 113 }, { "epoch": 0.1859706362153344, "grad_norm": 5.590596675872803, "learning_rate": 9.803663019711654e-06, "loss": 1.3043, "mean_token_accuracy": 0.6894215941429138, "step": 114 }, { "epoch": 0.18760195758564438, "grad_norm": 6.427780628204346, "learning_rate": 9.796097082651511e-06, "loss": 1.6446, "mean_token_accuracy": 0.6234225034713745, "step": 115 }, { "epoch": 0.18923327895595432, "grad_norm": 6.452088356018066, "learning_rate": 9.788391486420914e-06, "loss": 1.4595, "mean_token_accuracy": 0.6346368789672852, "step": 116 }, { "epoch": 0.19086460032626426, "grad_norm": 5.884222984313965, "learning_rate": 9.780546481516338e-06, "loss": 1.3437, "mean_token_accuracy": 0.6792058348655701, "step": 117 }, { "epoch": 0.19249592169657423, "grad_norm": 5.718683242797852, "learning_rate": 9.772562322966209e-06, "loss": 1.2696, "mean_token_accuracy": 0.6850185394287109, "step": 118 }, { "epoch": 0.19412724306688417, "grad_norm": 5.645365238189697, "learning_rate": 9.764439270322612e-06, "loss": 1.5184, "mean_token_accuracy": 0.6474390625953674, "step": 119 }, { "epoch": 0.19575856443719414, "grad_norm": 5.762539386749268, "learning_rate": 9.756177587652857e-06, "loss": 1.4345, "mean_token_accuracy": 0.6544578075408936, "step": 120 }, { "epoch": 0.19738988580750408, "grad_norm": 5.77543306350708, "learning_rate": 9.74777754353089e-06, "loss": 1.7153, "mean_token_accuracy": 0.6169678568840027, "step": 121 }, { "epoch": 0.19902120717781402, "grad_norm": 5.565819263458252, "learning_rate": 9.739239411028565e-06, "loss": 1.3033, "mean_token_accuracy": 0.6986506581306458, "step": 122 }, { "epoch": 0.200652528548124, "grad_norm": 5.547922134399414, "learning_rate": 9.730563467706765e-06, "loss": 1.327, "mean_token_accuracy": 0.683811604976654, "step": 123 }, { "epoch": 0.20228384991843393, "grad_norm": 5.765176296234131, "learning_rate": 9.721749995606381e-06, "loss": 1.3776, "mean_token_accuracy": 0.6884735226631165, "step": 124 }, { "epoch": 0.2039151712887439, "grad_norm": 5.340542793273926, "learning_rate": 9.712799281239142e-06, "loss": 1.4246, "mean_token_accuracy": 0.6791791915893555, "step": 125 }, { "epoch": 0.20554649265905384, "grad_norm": 5.423886775970459, "learning_rate": 9.703711615578301e-06, "loss": 1.1438, "mean_token_accuracy": 0.7353861927986145, "step": 126 }, { "epoch": 0.20717781402936378, "grad_norm": 5.641276836395264, "learning_rate": 9.694487294049174e-06, "loss": 1.4128, "mean_token_accuracy": 0.6514989137649536, "step": 127 }, { "epoch": 0.20880913539967375, "grad_norm": 5.543446063995361, "learning_rate": 9.685126616519545e-06, "loss": 1.4135, "mean_token_accuracy": 0.6586325764656067, "step": 128 }, { "epoch": 0.21044045676998369, "grad_norm": 6.770927906036377, "learning_rate": 9.675629887289904e-06, "loss": 1.4884, "mean_token_accuracy": 0.6546052694320679, "step": 129 }, { "epoch": 0.21207177814029363, "grad_norm": 5.887889385223389, "learning_rate": 9.665997415083565e-06, "loss": 1.4939, "mean_token_accuracy": 0.653674840927124, "step": 130 }, { "epoch": 0.2137030995106036, "grad_norm": 5.511849880218506, "learning_rate": 9.656229513036623e-06, "loss": 1.2267, "mean_token_accuracy": 0.7116374969482422, "step": 131 }, { "epoch": 0.21533442088091354, "grad_norm": 5.637845039367676, "learning_rate": 9.646326498687787e-06, "loss": 1.5632, "mean_token_accuracy": 0.6471421718597412, "step": 132 }, { "epoch": 0.2169657422512235, "grad_norm": 5.33619499206543, "learning_rate": 9.636288693968039e-06, "loss": 1.4464, "mean_token_accuracy": 0.656867265701294, "step": 133 }, { "epoch": 0.21859706362153344, "grad_norm": 5.903771877288818, "learning_rate": 9.626116425190182e-06, "loss": 1.5197, "mean_token_accuracy": 0.6431440114974976, "step": 134 }, { "epoch": 0.22022838499184338, "grad_norm": 5.29071569442749, "learning_rate": 9.615810023038228e-06, "loss": 1.4022, "mean_token_accuracy": 0.646789014339447, "step": 135 }, { "epoch": 0.22185970636215335, "grad_norm": 5.770832538604736, "learning_rate": 9.605369822556651e-06, "loss": 1.3488, "mean_token_accuracy": 0.672672688961029, "step": 136 }, { "epoch": 0.2234910277324633, "grad_norm": 5.827826023101807, "learning_rate": 9.594796163139487e-06, "loss": 1.2913, "mean_token_accuracy": 0.707563042640686, "step": 137 }, { "epoch": 0.22512234910277323, "grad_norm": 6.449001312255859, "learning_rate": 9.584089388519307e-06, "loss": 1.6024, "mean_token_accuracy": 0.6305343508720398, "step": 138 }, { "epoch": 0.2267536704730832, "grad_norm": 5.251701831817627, "learning_rate": 9.573249846756048e-06, "loss": 1.4945, "mean_token_accuracy": 0.6551724076271057, "step": 139 }, { "epoch": 0.22838499184339314, "grad_norm": 5.719169616699219, "learning_rate": 9.562277890225683e-06, "loss": 1.4588, "mean_token_accuracy": 0.6551551818847656, "step": 140 }, { "epoch": 0.2300163132137031, "grad_norm": 5.2488226890563965, "learning_rate": 9.551173875608785e-06, "loss": 1.235, "mean_token_accuracy": 0.6981236338615417, "step": 141 }, { "epoch": 0.23164763458401305, "grad_norm": 5.853959083557129, "learning_rate": 9.539938163878916e-06, "loss": 1.3501, "mean_token_accuracy": 0.6693121790885925, "step": 142 }, { "epoch": 0.233278955954323, "grad_norm": 5.647499084472656, "learning_rate": 9.528571120290894e-06, "loss": 1.2444, "mean_token_accuracy": 0.7117318511009216, "step": 143 }, { "epoch": 0.23491027732463296, "grad_norm": 5.933478832244873, "learning_rate": 9.517073114368933e-06, "loss": 1.4919, "mean_token_accuracy": 0.6552088856697083, "step": 144 }, { "epoch": 0.2365415986949429, "grad_norm": 5.842235565185547, "learning_rate": 9.505444519894616e-06, "loss": 1.52, "mean_token_accuracy": 0.6385658979415894, "step": 145 }, { "epoch": 0.23817292006525284, "grad_norm": 6.486652374267578, "learning_rate": 9.493685714894746e-06, "loss": 1.1983, "mean_token_accuracy": 0.7004634737968445, "step": 146 }, { "epoch": 0.2398042414355628, "grad_norm": 4.8720245361328125, "learning_rate": 9.481797081629068e-06, "loss": 1.3004, "mean_token_accuracy": 0.709541380405426, "step": 147 }, { "epoch": 0.24143556280587275, "grad_norm": 5.410114288330078, "learning_rate": 9.469779006577822e-06, "loss": 1.2591, "mean_token_accuracy": 0.690431535243988, "step": 148 }, { "epoch": 0.24306688417618272, "grad_norm": 5.812628746032715, "learning_rate": 9.4576318804292e-06, "loss": 1.612, "mean_token_accuracy": 0.6232700943946838, "step": 149 }, { "epoch": 0.24469820554649266, "grad_norm": 6.259674072265625, "learning_rate": 9.445356098066638e-06, "loss": 1.3041, "mean_token_accuracy": 0.6718587875366211, "step": 150 }, { "epoch": 0.2463295269168026, "grad_norm": 6.436178207397461, "learning_rate": 9.43295205855597e-06, "loss": 1.6111, "mean_token_accuracy": 0.6207820177078247, "step": 151 }, { "epoch": 0.24796084828711257, "grad_norm": 5.527941703796387, "learning_rate": 9.420420165132466e-06, "loss": 1.6642, "mean_token_accuracy": 0.6238217949867249, "step": 152 }, { "epoch": 0.2495921696574225, "grad_norm": 5.792147159576416, "learning_rate": 9.407760825187722e-06, "loss": 1.4365, "mean_token_accuracy": 0.6555671095848083, "step": 153 }, { "epoch": 0.25122349102773245, "grad_norm": 5.005126953125, "learning_rate": 9.39497445025641e-06, "loss": 1.2446, "mean_token_accuracy": 0.7050209045410156, "step": 154 }, { "epoch": 0.2528548123980424, "grad_norm": 5.894453048706055, "learning_rate": 9.38206145600291e-06, "loss": 1.5225, "mean_token_accuracy": 0.6514018774032593, "step": 155 }, { "epoch": 0.2544861337683524, "grad_norm": 5.637172698974609, "learning_rate": 9.369022262207788e-06, "loss": 1.5141, "mean_token_accuracy": 0.622454047203064, "step": 156 }, { "epoch": 0.2561174551386623, "grad_norm": 5.716491222381592, "learning_rate": 9.355857292754152e-06, "loss": 1.5215, "mean_token_accuracy": 0.6571729779243469, "step": 157 }, { "epoch": 0.25774877650897227, "grad_norm": 6.088312149047852, "learning_rate": 9.342566975613875e-06, "loss": 1.5606, "mean_token_accuracy": 0.6172152161598206, "step": 158 }, { "epoch": 0.25938009787928223, "grad_norm": 6.6313796043396, "learning_rate": 9.329151742833678e-06, "loss": 1.261, "mean_token_accuracy": 0.6948052048683167, "step": 159 }, { "epoch": 0.26101141924959215, "grad_norm": 6.572261333465576, "learning_rate": 9.315612030521091e-06, "loss": 1.174, "mean_token_accuracy": 0.7152777910232544, "step": 160 }, { "epoch": 0.2626427406199021, "grad_norm": 6.0583882331848145, "learning_rate": 9.301948278830273e-06, "loss": 1.4, "mean_token_accuracy": 0.6757156848907471, "step": 161 }, { "epoch": 0.2642740619902121, "grad_norm": 5.715542316436768, "learning_rate": 9.288160931947698e-06, "loss": 1.3266, "mean_token_accuracy": 0.6793855428695679, "step": 162 }, { "epoch": 0.265905383360522, "grad_norm": 5.376319408416748, "learning_rate": 9.274250438077724e-06, "loss": 1.1109, "mean_token_accuracy": 0.7322580814361572, "step": 163 }, { "epoch": 0.26753670473083196, "grad_norm": 5.3145012855529785, "learning_rate": 9.260217249428016e-06, "loss": 1.1862, "mean_token_accuracy": 0.7048360109329224, "step": 164 }, { "epoch": 0.26916802610114193, "grad_norm": 6.1805338859558105, "learning_rate": 9.246061822194849e-06, "loss": 1.5489, "mean_token_accuracy": 0.6458333134651184, "step": 165 }, { "epoch": 0.2707993474714519, "grad_norm": 5.672875881195068, "learning_rate": 9.231784616548277e-06, "loss": 1.3288, "mean_token_accuracy": 0.6853932738304138, "step": 166 }, { "epoch": 0.2724306688417618, "grad_norm": 5.999112606048584, "learning_rate": 9.217386096617175e-06, "loss": 1.5361, "mean_token_accuracy": 0.6438902616500854, "step": 167 }, { "epoch": 0.2740619902120718, "grad_norm": 6.415194511413574, "learning_rate": 9.202866730474143e-06, "loss": 1.5405, "mean_token_accuracy": 0.6401821970939636, "step": 168 }, { "epoch": 0.27569331158238175, "grad_norm": 6.119101524353027, "learning_rate": 9.188226990120303e-06, "loss": 1.4685, "mean_token_accuracy": 0.6468571424484253, "step": 169 }, { "epoch": 0.27732463295269166, "grad_norm": 5.0899434089660645, "learning_rate": 9.173467351469943e-06, "loss": 1.1837, "mean_token_accuracy": 0.7153804302215576, "step": 170 }, { "epoch": 0.27895595432300163, "grad_norm": 5.665865421295166, "learning_rate": 9.158588294335055e-06, "loss": 1.271, "mean_token_accuracy": 0.6892816424369812, "step": 171 }, { "epoch": 0.2805872756933116, "grad_norm": 5.781040668487549, "learning_rate": 9.14359030240973e-06, "loss": 1.1938, "mean_token_accuracy": 0.7185488343238831, "step": 172 }, { "epoch": 0.2822185970636215, "grad_norm": 4.997267723083496, "learning_rate": 9.128473863254438e-06, "loss": 1.2519, "mean_token_accuracy": 0.6875, "step": 173 }, { "epoch": 0.2838499184339315, "grad_norm": 5.392592906951904, "learning_rate": 9.113239468280175e-06, "loss": 1.5819, "mean_token_accuracy": 0.6332082748413086, "step": 174 }, { "epoch": 0.28548123980424145, "grad_norm": 4.405828952789307, "learning_rate": 9.097887612732495e-06, "loss": 0.9685, "mean_token_accuracy": 0.7657608985900879, "step": 175 }, { "epoch": 0.28711256117455136, "grad_norm": 4.870915412902832, "learning_rate": 9.082418795675397e-06, "loss": 1.2698, "mean_token_accuracy": 0.7014712691307068, "step": 176 }, { "epoch": 0.28874388254486133, "grad_norm": 5.485860824584961, "learning_rate": 9.066833519975118e-06, "loss": 1.3616, "mean_token_accuracy": 0.6694870591163635, "step": 177 }, { "epoch": 0.2903752039151713, "grad_norm": 5.251032829284668, "learning_rate": 9.051132292283772e-06, "loss": 1.1863, "mean_token_accuracy": 0.6943209767341614, "step": 178 }, { "epoch": 0.29200652528548127, "grad_norm": 5.481298923492432, "learning_rate": 9.035315623022886e-06, "loss": 1.3581, "mean_token_accuracy": 0.6696730256080627, "step": 179 }, { "epoch": 0.2936378466557912, "grad_norm": 5.111570358276367, "learning_rate": 9.019384026366807e-06, "loss": 1.3505, "mean_token_accuracy": 0.6688086986541748, "step": 180 }, { "epoch": 0.29526916802610115, "grad_norm": 4.826779842376709, "learning_rate": 9.003338020225986e-06, "loss": 1.1635, "mean_token_accuracy": 0.7186034321784973, "step": 181 }, { "epoch": 0.2969004893964111, "grad_norm": 5.660580635070801, "learning_rate": 8.987178126230138e-06, "loss": 1.5801, "mean_token_accuracy": 0.6331775784492493, "step": 182 }, { "epoch": 0.29853181076672103, "grad_norm": 5.761633396148682, "learning_rate": 8.97090486971129e-06, "loss": 1.1748, "mean_token_accuracy": 0.7208150029182434, "step": 183 }, { "epoch": 0.300163132137031, "grad_norm": 5.576194763183594, "learning_rate": 8.954518779686704e-06, "loss": 1.4442, "mean_token_accuracy": 0.6586382389068604, "step": 184 }, { "epoch": 0.30179445350734097, "grad_norm": 5.576228618621826, "learning_rate": 8.938020388841673e-06, "loss": 1.3454, "mean_token_accuracy": 0.6765140295028687, "step": 185 }, { "epoch": 0.3034257748776509, "grad_norm": 4.994912624359131, "learning_rate": 8.921410233512211e-06, "loss": 1.24, "mean_token_accuracy": 0.7072243094444275, "step": 186 }, { "epoch": 0.30505709624796085, "grad_norm": 5.298640251159668, "learning_rate": 8.904688853667612e-06, "loss": 1.3136, "mean_token_accuracy": 0.6705882549285889, "step": 187 }, { "epoch": 0.3066884176182708, "grad_norm": 5.550191879272461, "learning_rate": 8.887856792892902e-06, "loss": 1.3868, "mean_token_accuracy": 0.6856528520584106, "step": 188 }, { "epoch": 0.3083197389885807, "grad_norm": 5.478514671325684, "learning_rate": 8.87091459837116e-06, "loss": 1.2973, "mean_token_accuracy": 0.6864721775054932, "step": 189 }, { "epoch": 0.3099510603588907, "grad_norm": 5.3640546798706055, "learning_rate": 8.853862820865742e-06, "loss": 1.4836, "mean_token_accuracy": 0.6382033824920654, "step": 190 }, { "epoch": 0.31158238172920066, "grad_norm": 4.50584077835083, "learning_rate": 8.83670201470237e-06, "loss": 1.0835, "mean_token_accuracy": 0.7182095646858215, "step": 191 }, { "epoch": 0.3132137030995106, "grad_norm": 5.293252944946289, "learning_rate": 8.819432737751097e-06, "loss": 1.2622, "mean_token_accuracy": 0.6940993666648865, "step": 192 }, { "epoch": 0.31484502446982054, "grad_norm": 4.696035861968994, "learning_rate": 8.802055551408207e-06, "loss": 1.189, "mean_token_accuracy": 0.7159493565559387, "step": 193 }, { "epoch": 0.3164763458401305, "grad_norm": 4.758869171142578, "learning_rate": 8.784571020577926e-06, "loss": 1.0363, "mean_token_accuracy": 0.7414075136184692, "step": 194 }, { "epoch": 0.3181076672104405, "grad_norm": 5.393585681915283, "learning_rate": 8.76697971365409e-06, "loss": 1.3754, "mean_token_accuracy": 0.6663179993629456, "step": 195 }, { "epoch": 0.3197389885807504, "grad_norm": 5.480104446411133, "learning_rate": 8.74928220250164e-06, "loss": 1.7055, "mean_token_accuracy": 0.6046082973480225, "step": 196 }, { "epoch": 0.32137030995106036, "grad_norm": 5.184609413146973, "learning_rate": 8.731479062438056e-06, "loss": 1.4335, "mean_token_accuracy": 0.6592000126838684, "step": 197 }, { "epoch": 0.32300163132137033, "grad_norm": 5.132387638092041, "learning_rate": 8.713570872214637e-06, "loss": 1.4172, "mean_token_accuracy": 0.6633475422859192, "step": 198 }, { "epoch": 0.32463295269168024, "grad_norm": 5.561227798461914, "learning_rate": 8.695558213997692e-06, "loss": 1.5116, "mean_token_accuracy": 0.6382217407226562, "step": 199 }, { "epoch": 0.3262642740619902, "grad_norm": 6.255463123321533, "learning_rate": 8.677441673349622e-06, "loss": 1.3863, "mean_token_accuracy": 0.6630803942680359, "step": 200 }, { "epoch": 0.3278955954323002, "grad_norm": 4.947396755218506, "learning_rate": 8.659221839209869e-06, "loss": 1.4143, "mean_token_accuracy": 0.6645483374595642, "step": 201 }, { "epoch": 0.3295269168026101, "grad_norm": 5.235170364379883, "learning_rate": 8.640899303875785e-06, "loss": 1.2793, "mean_token_accuracy": 0.6936695575714111, "step": 202 }, { "epoch": 0.33115823817292006, "grad_norm": 5.727679252624512, "learning_rate": 8.622474662983372e-06, "loss": 1.428, "mean_token_accuracy": 0.6479238867759705, "step": 203 }, { "epoch": 0.33278955954323003, "grad_norm": 5.557906627655029, "learning_rate": 8.60394851548792e-06, "loss": 1.3305, "mean_token_accuracy": 0.6868632435798645, "step": 204 }, { "epoch": 0.33442088091353994, "grad_norm": 5.403807640075684, "learning_rate": 8.585321463644525e-06, "loss": 1.3701, "mean_token_accuracy": 0.6680100560188293, "step": 205 }, { "epoch": 0.3360522022838499, "grad_norm": 5.334835052490234, "learning_rate": 8.566594112988534e-06, "loss": 1.3598, "mean_token_accuracy": 0.6583541035652161, "step": 206 }, { "epoch": 0.3376835236541599, "grad_norm": 4.983403205871582, "learning_rate": 8.547767072315835e-06, "loss": 1.2434, "mean_token_accuracy": 0.6838777661323547, "step": 207 }, { "epoch": 0.33931484502446985, "grad_norm": 5.587502956390381, "learning_rate": 8.528840953663086e-06, "loss": 1.3061, "mean_token_accuracy": 0.688642680644989, "step": 208 }, { "epoch": 0.34094616639477976, "grad_norm": 5.853117942810059, "learning_rate": 8.5098163722878e-06, "loss": 1.4633, "mean_token_accuracy": 0.6635462641716003, "step": 209 }, { "epoch": 0.3425774877650897, "grad_norm": 5.541942596435547, "learning_rate": 8.490693946648364e-06, "loss": 1.2622, "mean_token_accuracy": 0.7057894468307495, "step": 210 }, { "epoch": 0.3442088091353997, "grad_norm": 5.35739278793335, "learning_rate": 8.47147429838392e-06, "loss": 1.2618, "mean_token_accuracy": 0.689638078212738, "step": 211 }, { "epoch": 0.3458401305057096, "grad_norm": 5.423904895782471, "learning_rate": 8.452158052294158e-06, "loss": 1.5032, "mean_token_accuracy": 0.6418230533599854, "step": 212 }, { "epoch": 0.3474714518760196, "grad_norm": 4.8785223960876465, "learning_rate": 8.432745836319007e-06, "loss": 1.4344, "mean_token_accuracy": 0.6615913510322571, "step": 213 }, { "epoch": 0.34910277324632955, "grad_norm": 4.893246650695801, "learning_rate": 8.413238281518225e-06, "loss": 1.2007, "mean_token_accuracy": 0.6991991996765137, "step": 214 }, { "epoch": 0.35073409461663946, "grad_norm": 5.7973504066467285, "learning_rate": 8.39363602205088e-06, "loss": 1.5249, "mean_token_accuracy": 0.6353210806846619, "step": 215 }, { "epoch": 0.3523654159869494, "grad_norm": 5.406508922576904, "learning_rate": 8.373939695154739e-06, "loss": 1.2806, "mean_token_accuracy": 0.6916395425796509, "step": 216 }, { "epoch": 0.3539967373572594, "grad_norm": 4.771231174468994, "learning_rate": 8.354149941125539e-06, "loss": 1.1256, "mean_token_accuracy": 0.7322953343391418, "step": 217 }, { "epoch": 0.3556280587275693, "grad_norm": 5.047488689422607, "learning_rate": 8.334267403296193e-06, "loss": 1.1106, "mean_token_accuracy": 0.7239696383476257, "step": 218 }, { "epoch": 0.3572593800978793, "grad_norm": 5.410397529602051, "learning_rate": 8.314292728015859e-06, "loss": 1.182, "mean_token_accuracy": 0.7058823704719543, "step": 219 }, { "epoch": 0.35889070146818924, "grad_norm": 6.237778663635254, "learning_rate": 8.294226564628936e-06, "loss": 1.2493, "mean_token_accuracy": 0.6834862232208252, "step": 220 }, { "epoch": 0.3605220228384992, "grad_norm": 5.143507957458496, "learning_rate": 8.274069565453955e-06, "loss": 1.352, "mean_token_accuracy": 0.6808404326438904, "step": 221 }, { "epoch": 0.3621533442088091, "grad_norm": 5.389186859130859, "learning_rate": 8.25382238576237e-06, "loss": 1.2109, "mean_token_accuracy": 0.7188329100608826, "step": 222 }, { "epoch": 0.3637846655791191, "grad_norm": 5.256932735443115, "learning_rate": 8.23348568375726e-06, "loss": 1.3621, "mean_token_accuracy": 0.679024875164032, "step": 223 }, { "epoch": 0.36541598694942906, "grad_norm": 5.2731146812438965, "learning_rate": 8.213060120551923e-06, "loss": 1.4888, "mean_token_accuracy": 0.644489586353302, "step": 224 }, { "epoch": 0.367047308319739, "grad_norm": 5.008488655090332, "learning_rate": 8.1925463601484e-06, "loss": 1.3388, "mean_token_accuracy": 0.6928645372390747, "step": 225 }, { "epoch": 0.36867862969004894, "grad_norm": 6.0909247398376465, "learning_rate": 8.171945069415877e-06, "loss": 1.3308, "mean_token_accuracy": 0.6703540086746216, "step": 226 }, { "epoch": 0.3703099510603589, "grad_norm": 6.270472526550293, "learning_rate": 8.151256918069002e-06, "loss": 1.5142, "mean_token_accuracy": 0.6341871023178101, "step": 227 }, { "epoch": 0.3719412724306688, "grad_norm": 5.570935249328613, "learning_rate": 8.130482578646137e-06, "loss": 1.1315, "mean_token_accuracy": 0.7041916251182556, "step": 228 }, { "epoch": 0.3735725938009788, "grad_norm": 5.195607662200928, "learning_rate": 8.109622726487463e-06, "loss": 1.54, "mean_token_accuracy": 0.6397637724876404, "step": 229 }, { "epoch": 0.37520391517128876, "grad_norm": 4.792831897735596, "learning_rate": 8.088678039713052e-06, "loss": 1.2567, "mean_token_accuracy": 0.7066537141799927, "step": 230 }, { "epoch": 0.3768352365415987, "grad_norm": 5.558446407318115, "learning_rate": 8.067649199200807e-06, "loss": 1.3282, "mean_token_accuracy": 0.6886616945266724, "step": 231 }, { "epoch": 0.37846655791190864, "grad_norm": 5.962700366973877, "learning_rate": 8.046536888564335e-06, "loss": 1.2761, "mean_token_accuracy": 0.6899516582489014, "step": 232 }, { "epoch": 0.3800978792822186, "grad_norm": 4.565369129180908, "learning_rate": 8.025341794130722e-06, "loss": 1.1489, "mean_token_accuracy": 0.7200378775596619, "step": 233 }, { "epoch": 0.3817292006525285, "grad_norm": 5.34097146987915, "learning_rate": 8.004064604918219e-06, "loss": 1.5369, "mean_token_accuracy": 0.6295350193977356, "step": 234 }, { "epoch": 0.3833605220228385, "grad_norm": 4.983196258544922, "learning_rate": 7.982706012613854e-06, "loss": 1.1661, "mean_token_accuracy": 0.6999412775039673, "step": 235 }, { "epoch": 0.38499184339314846, "grad_norm": 5.128100395202637, "learning_rate": 7.961266711550922e-06, "loss": 1.345, "mean_token_accuracy": 0.6874102354049683, "step": 236 }, { "epoch": 0.3866231647634584, "grad_norm": 5.386168479919434, "learning_rate": 7.939747398686445e-06, "loss": 1.3224, "mean_token_accuracy": 0.6796019673347473, "step": 237 }, { "epoch": 0.38825448613376834, "grad_norm": 5.458306312561035, "learning_rate": 7.918148773578492e-06, "loss": 1.4898, "mean_token_accuracy": 0.6451776623725891, "step": 238 }, { "epoch": 0.3898858075040783, "grad_norm": 5.1783552169799805, "learning_rate": 7.896471538363442e-06, "loss": 1.5354, "mean_token_accuracy": 0.6542155742645264, "step": 239 }, { "epoch": 0.3915171288743883, "grad_norm": 5.7401580810546875, "learning_rate": 7.874716397733172e-06, "loss": 1.4129, "mean_token_accuracy": 0.6713286638259888, "step": 240 }, { "epoch": 0.3931484502446982, "grad_norm": 5.0389180183410645, "learning_rate": 7.852884058912124e-06, "loss": 1.4643, "mean_token_accuracy": 0.6414728760719299, "step": 241 }, { "epoch": 0.39477977161500816, "grad_norm": 4.874699592590332, "learning_rate": 7.830975231634341e-06, "loss": 1.0325, "mean_token_accuracy": 0.740480363368988, "step": 242 }, { "epoch": 0.3964110929853181, "grad_norm": 4.744316101074219, "learning_rate": 7.808990628120374e-06, "loss": 1.154, "mean_token_accuracy": 0.7321428656578064, "step": 243 }, { "epoch": 0.39804241435562804, "grad_norm": 4.903902530670166, "learning_rate": 7.786930963054142e-06, "loss": 1.2538, "mean_token_accuracy": 0.6969696879386902, "step": 244 }, { "epoch": 0.399673735725938, "grad_norm": 4.861123085021973, "learning_rate": 7.76479695355969e-06, "loss": 1.1761, "mean_token_accuracy": 0.7024747133255005, "step": 245 }, { "epoch": 0.401305057096248, "grad_norm": 5.309647083282471, "learning_rate": 7.742589319177879e-06, "loss": 1.2522, "mean_token_accuracy": 0.7030481696128845, "step": 246 }, { "epoch": 0.4029363784665579, "grad_norm": 4.72802209854126, "learning_rate": 7.720308781843003e-06, "loss": 1.1986, "mean_token_accuracy": 0.7094155550003052, "step": 247 }, { "epoch": 0.40456769983686786, "grad_norm": 6.070117473602295, "learning_rate": 7.697956065859308e-06, "loss": 1.295, "mean_token_accuracy": 0.6842672228813171, "step": 248 }, { "epoch": 0.4061990212071778, "grad_norm": 4.879459857940674, "learning_rate": 7.67553189787745e-06, "loss": 1.2096, "mean_token_accuracy": 0.686804473400116, "step": 249 }, { "epoch": 0.4078303425774878, "grad_norm": 5.451211452484131, "learning_rate": 7.653037006870878e-06, "loss": 1.4763, "mean_token_accuracy": 0.637888491153717, "step": 250 }, { "epoch": 0.4094616639477977, "grad_norm": 4.923818588256836, "learning_rate": 7.630472124112125e-06, "loss": 1.2607, "mean_token_accuracy": 0.6872745752334595, "step": 251 }, { "epoch": 0.4110929853181077, "grad_norm": 5.415319442749023, "learning_rate": 7.607837983149057e-06, "loss": 1.1446, "mean_token_accuracy": 0.7222545146942139, "step": 252 }, { "epoch": 0.41272430668841764, "grad_norm": 5.4529900550842285, "learning_rate": 7.585135319780995e-06, "loss": 1.4468, "mean_token_accuracy": 0.6554580926895142, "step": 253 }, { "epoch": 0.41435562805872755, "grad_norm": 5.247809410095215, "learning_rate": 7.562364872034823e-06, "loss": 1.3883, "mean_token_accuracy": 0.6721068024635315, "step": 254 }, { "epoch": 0.4159869494290375, "grad_norm": 5.47812557220459, "learning_rate": 7.5395273801409854e-06, "loss": 1.4343, "mean_token_accuracy": 0.6608517169952393, "step": 255 }, { "epoch": 0.4176182707993475, "grad_norm": 5.498720645904541, "learning_rate": 7.5166235865094174e-06, "loss": 1.4222, "mean_token_accuracy": 0.6456736326217651, "step": 256 }, { "epoch": 0.4192495921696574, "grad_norm": 4.786160945892334, "learning_rate": 7.493654235705422e-06, "loss": 1.4204, "mean_token_accuracy": 0.6773132681846619, "step": 257 }, { "epoch": 0.42088091353996737, "grad_norm": 5.397915840148926, "learning_rate": 7.470620074425459e-06, "loss": 1.4843, "mean_token_accuracy": 0.6380900740623474, "step": 258 }, { "epoch": 0.42251223491027734, "grad_norm": 5.466760158538818, "learning_rate": 7.447521851472872e-06, "loss": 1.4852, "mean_token_accuracy": 0.6487154364585876, "step": 259 }, { "epoch": 0.42414355628058725, "grad_norm": 5.804627895355225, "learning_rate": 7.424360317733544e-06, "loss": 1.3923, "mean_token_accuracy": 0.6545741558074951, "step": 260 }, { "epoch": 0.4257748776508972, "grad_norm": 5.381365776062012, "learning_rate": 7.401136226151488e-06, "loss": 1.4495, "mean_token_accuracy": 0.6681222915649414, "step": 261 }, { "epoch": 0.4274061990212072, "grad_norm": 4.776740550994873, "learning_rate": 7.377850331704377e-06, "loss": 1.0082, "mean_token_accuracy": 0.7397812604904175, "step": 262 }, { "epoch": 0.4290375203915171, "grad_norm": 5.0946149826049805, "learning_rate": 7.354503391378992e-06, "loss": 1.1745, "mean_token_accuracy": 0.7127882838249207, "step": 263 }, { "epoch": 0.43066884176182707, "grad_norm": 5.161426067352295, "learning_rate": 7.331096164146616e-06, "loss": 1.4598, "mean_token_accuracy": 0.6507353186607361, "step": 264 }, { "epoch": 0.43230016313213704, "grad_norm": 5.14084005355835, "learning_rate": 7.307629410938364e-06, "loss": 1.3107, "mean_token_accuracy": 0.6751173734664917, "step": 265 }, { "epoch": 0.433931484502447, "grad_norm": 4.744462966918945, "learning_rate": 7.28410389462044e-06, "loss": 1.2483, "mean_token_accuracy": 0.6957618594169617, "step": 266 }, { "epoch": 0.4355628058727569, "grad_norm": 5.074441432952881, "learning_rate": 7.260520379969347e-06, "loss": 1.2429, "mean_token_accuracy": 0.7157652378082275, "step": 267 }, { "epoch": 0.4371941272430669, "grad_norm": 5.745429992675781, "learning_rate": 7.236879633647018e-06, "loss": 1.3938, "mean_token_accuracy": 0.6745472550392151, "step": 268 }, { "epoch": 0.43882544861337686, "grad_norm": 4.631476879119873, "learning_rate": 7.213182424175895e-06, "loss": 1.141, "mean_token_accuracy": 0.726822018623352, "step": 269 }, { "epoch": 0.44045676998368677, "grad_norm": 4.983883857727051, "learning_rate": 7.189429521913942e-06, "loss": 1.4304, "mean_token_accuracy": 0.656844973564148, "step": 270 }, { "epoch": 0.44208809135399674, "grad_norm": 5.129734039306641, "learning_rate": 7.165621699029615e-06, "loss": 1.2641, "mean_token_accuracy": 0.7042542099952698, "step": 271 }, { "epoch": 0.4437194127243067, "grad_norm": 5.031182765960693, "learning_rate": 7.1417597294767405e-06, "loss": 1.0971, "mean_token_accuracy": 0.7178630828857422, "step": 272 }, { "epoch": 0.4453507340946166, "grad_norm": 4.948353290557861, "learning_rate": 7.1178443889693694e-06, "loss": 0.9821, "mean_token_accuracy": 0.7603078484535217, "step": 273 }, { "epoch": 0.4469820554649266, "grad_norm": 5.791008472442627, "learning_rate": 7.0938764549565605e-06, "loss": 1.3631, "mean_token_accuracy": 0.6715368032455444, "step": 274 }, { "epoch": 0.44861337683523655, "grad_norm": 5.602413654327393, "learning_rate": 7.069856706597095e-06, "loss": 1.4013, "mean_token_accuracy": 0.6645569801330566, "step": 275 }, { "epoch": 0.45024469820554647, "grad_norm": 4.785682201385498, "learning_rate": 7.04578592473416e-06, "loss": 1.2362, "mean_token_accuracy": 0.6916077136993408, "step": 276 }, { "epoch": 0.45187601957585644, "grad_norm": 4.269145488739014, "learning_rate": 7.021664891869955e-06, "loss": 1.1638, "mean_token_accuracy": 0.7208147048950195, "step": 277 }, { "epoch": 0.4535073409461664, "grad_norm": 5.598394870758057, "learning_rate": 6.997494392140264e-06, "loss": 1.449, "mean_token_accuracy": 0.6509479880332947, "step": 278 }, { "epoch": 0.4551386623164764, "grad_norm": 4.253592491149902, "learning_rate": 6.973275211288953e-06, "loss": 0.962, "mean_token_accuracy": 0.7480490803718567, "step": 279 }, { "epoch": 0.4567699836867863, "grad_norm": 5.125161647796631, "learning_rate": 6.949008136642437e-06, "loss": 1.3255, "mean_token_accuracy": 0.6781437397003174, "step": 280 }, { "epoch": 0.45840130505709625, "grad_norm": 5.201466083526611, "learning_rate": 6.924693957084079e-06, "loss": 1.3969, "mean_token_accuracy": 0.6604675650596619, "step": 281 }, { "epoch": 0.4600326264274062, "grad_norm": 6.443404674530029, "learning_rate": 6.900333463028546e-06, "loss": 1.4835, "mean_token_accuracy": 0.6526094079017639, "step": 282 }, { "epoch": 0.46166394779771613, "grad_norm": 5.083189964294434, "learning_rate": 6.8759274463961145e-06, "loss": 1.3969, "mean_token_accuracy": 0.657814085483551, "step": 283 }, { "epoch": 0.4632952691680261, "grad_norm": 5.082605838775635, "learning_rate": 6.851476700586926e-06, "loss": 1.1498, "mean_token_accuracy": 0.7049723863601685, "step": 284 }, { "epoch": 0.46492659053833607, "grad_norm": 5.190065860748291, "learning_rate": 6.8269820204551985e-06, "loss": 1.3005, "mean_token_accuracy": 0.6958598494529724, "step": 285 }, { "epoch": 0.466557911908646, "grad_norm": 5.341015338897705, "learning_rate": 6.802444202283381e-06, "loss": 1.3399, "mean_token_accuracy": 0.6875981092453003, "step": 286 }, { "epoch": 0.46818923327895595, "grad_norm": 5.5725226402282715, "learning_rate": 6.777864043756268e-06, "loss": 1.2856, "mean_token_accuracy": 0.6699952483177185, "step": 287 }, { "epoch": 0.4698205546492659, "grad_norm": 5.266445636749268, "learning_rate": 6.7532423439350794e-06, "loss": 1.4138, "mean_token_accuracy": 0.6606606841087341, "step": 288 }, { "epoch": 0.47145187601957583, "grad_norm": 4.3366780281066895, "learning_rate": 6.728579903231463e-06, "loss": 0.9495, "mean_token_accuracy": 0.7561102509498596, "step": 289 }, { "epoch": 0.4730831973898858, "grad_norm": 5.066049575805664, "learning_rate": 6.703877523381495e-06, "loss": 1.4154, "mean_token_accuracy": 0.6551030874252319, "step": 290 }, { "epoch": 0.47471451876019577, "grad_norm": 5.101365089416504, "learning_rate": 6.679136007419607e-06, "loss": 1.1613, "mean_token_accuracy": 0.7094940543174744, "step": 291 }, { "epoch": 0.4763458401305057, "grad_norm": 5.026820182800293, "learning_rate": 6.654356159652483e-06, "loss": 1.1103, "mean_token_accuracy": 0.7332636117935181, "step": 292 }, { "epoch": 0.47797716150081565, "grad_norm": 5.343873977661133, "learning_rate": 6.629538785632912e-06, "loss": 1.2417, "mean_token_accuracy": 0.6913783550262451, "step": 293 }, { "epoch": 0.4796084828711256, "grad_norm": 4.890134334564209, "learning_rate": 6.604684692133597e-06, "loss": 1.1577, "mean_token_accuracy": 0.7185184955596924, "step": 294 }, { "epoch": 0.4812398042414356, "grad_norm": 4.612360000610352, "learning_rate": 6.579794687120938e-06, "loss": 1.1759, "mean_token_accuracy": 0.7133533954620361, "step": 295 }, { "epoch": 0.4828711256117455, "grad_norm": 5.377026081085205, "learning_rate": 6.554869579728753e-06, "loss": 1.3571, "mean_token_accuracy": 0.6833840012550354, "step": 296 }, { "epoch": 0.48450244698205547, "grad_norm": 4.644443988800049, "learning_rate": 6.5299101802319905e-06, "loss": 1.2068, "mean_token_accuracy": 0.678475558757782, "step": 297 }, { "epoch": 0.48613376835236544, "grad_norm": 5.2673869132995605, "learning_rate": 6.504917300020373e-06, "loss": 1.2203, "mean_token_accuracy": 0.7017102837562561, "step": 298 }, { "epoch": 0.48776508972267535, "grad_norm": 4.513294219970703, "learning_rate": 6.479891751572026e-06, "loss": 1.0001, "mean_token_accuracy": 0.7441986203193665, "step": 299 }, { "epoch": 0.4893964110929853, "grad_norm": 5.3284454345703125, "learning_rate": 6.454834348427077e-06, "loss": 1.4106, "mean_token_accuracy": 0.6686747074127197, "step": 300 }, { "epoch": 0.4910277324632953, "grad_norm": 4.518412113189697, "learning_rate": 6.429745905161183e-06, "loss": 1.0715, "mean_token_accuracy": 0.7324516773223877, "step": 301 }, { "epoch": 0.4926590538336052, "grad_norm": 6.271851062774658, "learning_rate": 6.404627237359078e-06, "loss": 1.3864, "mean_token_accuracy": 0.6706717610359192, "step": 302 }, { "epoch": 0.49429037520391517, "grad_norm": 5.340519905090332, "learning_rate": 6.379479161588039e-06, "loss": 1.3695, "mean_token_accuracy": 0.6908436417579651, "step": 303 }, { "epoch": 0.49592169657422513, "grad_norm": 5.797330856323242, "learning_rate": 6.354302495371352e-06, "loss": 1.5499, "mean_token_accuracy": 0.6514008641242981, "step": 304 }, { "epoch": 0.49755301794453505, "grad_norm": 5.730583667755127, "learning_rate": 6.329098057161731e-06, "loss": 1.2407, "mean_token_accuracy": 0.7070135474205017, "step": 305 }, { "epoch": 0.499184339314845, "grad_norm": 5.447910308837891, "learning_rate": 6.303866666314715e-06, "loss": 1.2594, "mean_token_accuracy": 0.6743515729904175, "step": 306 }, { "epoch": 0.5008156606851549, "grad_norm": 4.888600826263428, "learning_rate": 6.278609143062026e-06, "loss": 1.4212, "mean_token_accuracy": 0.6533401012420654, "step": 307 }, { "epoch": 0.5024469820554649, "grad_norm": 5.023259162902832, "learning_rate": 6.2533263084849095e-06, "loss": 1.149, "mean_token_accuracy": 0.7093348503112793, "step": 308 }, { "epoch": 0.5040783034257749, "grad_norm": 5.078878879547119, "learning_rate": 6.228018984487443e-06, "loss": 1.4057, "mean_token_accuracy": 0.6541189551353455, "step": 309 }, { "epoch": 0.5057096247960848, "grad_norm": 5.9124298095703125, "learning_rate": 6.202687993769811e-06, "loss": 1.381, "mean_token_accuracy": 0.6719298362731934, "step": 310 }, { "epoch": 0.5073409461663948, "grad_norm": 4.667541980743408, "learning_rate": 6.177334159801571e-06, "loss": 1.2029, "mean_token_accuracy": 0.703399121761322, "step": 311 }, { "epoch": 0.5089722675367048, "grad_norm": 5.1174116134643555, "learning_rate": 6.151958306794878e-06, "loss": 1.2424, "mean_token_accuracy": 0.6848514080047607, "step": 312 }, { "epoch": 0.5106035889070146, "grad_norm": 4.411402702331543, "learning_rate": 6.126561259677679e-06, "loss": 1.0155, "mean_token_accuracy": 0.7456547021865845, "step": 313 }, { "epoch": 0.5122349102773246, "grad_norm": 5.111578464508057, "learning_rate": 6.101143844066919e-06, "loss": 1.5141, "mean_token_accuracy": 0.6380020380020142, "step": 314 }, { "epoch": 0.5138662316476346, "grad_norm": 4.498473167419434, "learning_rate": 6.0757068862416855e-06, "loss": 1.0826, "mean_token_accuracy": 0.7336174845695496, "step": 315 }, { "epoch": 0.5154975530179445, "grad_norm": 4.840219974517822, "learning_rate": 6.050251213116356e-06, "loss": 1.1874, "mean_token_accuracy": 0.7011764645576477, "step": 316 }, { "epoch": 0.5171288743882545, "grad_norm": 5.577286243438721, "learning_rate": 6.024777652213702e-06, "loss": 1.3661, "mean_token_accuracy": 0.6853360533714294, "step": 317 }, { "epoch": 0.5187601957585645, "grad_norm": 5.854229927062988, "learning_rate": 5.9992870316380085e-06, "loss": 1.3195, "mean_token_accuracy": 0.6762208342552185, "step": 318 }, { "epoch": 0.5203915171288744, "grad_norm": 4.742856502532959, "learning_rate": 5.973780180048138e-06, "loss": 1.3327, "mean_token_accuracy": 0.6779661178588867, "step": 319 }, { "epoch": 0.5220228384991843, "grad_norm": 4.710707187652588, "learning_rate": 5.948257926630594e-06, "loss": 1.2339, "mean_token_accuracy": 0.6876561641693115, "step": 320 }, { "epoch": 0.5236541598694943, "grad_norm": 4.941910266876221, "learning_rate": 5.9227211010725774e-06, "loss": 1.2255, "mean_token_accuracy": 0.6985294222831726, "step": 321 }, { "epoch": 0.5252854812398042, "grad_norm": 4.412341594696045, "learning_rate": 5.897170533534997e-06, "loss": 1.0061, "mean_token_accuracy": 0.7371076345443726, "step": 322 }, { "epoch": 0.5269168026101142, "grad_norm": 5.1144843101501465, "learning_rate": 5.871607054625497e-06, "loss": 1.2831, "mean_token_accuracy": 0.6948955655097961, "step": 323 }, { "epoch": 0.5285481239804242, "grad_norm": 4.727828025817871, "learning_rate": 5.846031495371445e-06, "loss": 1.1593, "mean_token_accuracy": 0.716786801815033, "step": 324 }, { "epoch": 0.5301794453507341, "grad_norm": 4.1832451820373535, "learning_rate": 5.820444687192922e-06, "loss": 0.8687, "mean_token_accuracy": 0.7938080430030823, "step": 325 }, { "epoch": 0.531810766721044, "grad_norm": 4.599882125854492, "learning_rate": 5.794847461875699e-06, "loss": 1.2684, "mean_token_accuracy": 0.7069767713546753, "step": 326 }, { "epoch": 0.533442088091354, "grad_norm": 4.93676233291626, "learning_rate": 5.769240651544182e-06, "loss": 1.3537, "mean_token_accuracy": 0.6710861921310425, "step": 327 }, { "epoch": 0.5350734094616639, "grad_norm": 5.627974510192871, "learning_rate": 5.74362508863438e-06, "loss": 1.1545, "mean_token_accuracy": 0.6976369619369507, "step": 328 }, { "epoch": 0.5367047308319739, "grad_norm": 4.828066349029541, "learning_rate": 5.7180016058668255e-06, "loss": 1.3031, "mean_token_accuracy": 0.6644359230995178, "step": 329 }, { "epoch": 0.5383360522022839, "grad_norm": 4.859476089477539, "learning_rate": 5.692371036219517e-06, "loss": 1.2398, "mean_token_accuracy": 0.6936061382293701, "step": 330 }, { "epoch": 0.5399673735725938, "grad_norm": 5.192332744598389, "learning_rate": 5.666734212900838e-06, "loss": 1.4352, "mean_token_accuracy": 0.6518259048461914, "step": 331 }, { "epoch": 0.5415986949429038, "grad_norm": 5.073202610015869, "learning_rate": 5.641091969322462e-06, "loss": 1.4968, "mean_token_accuracy": 0.6290949583053589, "step": 332 }, { "epoch": 0.5432300163132137, "grad_norm": 5.155499458312988, "learning_rate": 5.615445139072276e-06, "loss": 1.2214, "mean_token_accuracy": 0.6994413137435913, "step": 333 }, { "epoch": 0.5448613376835236, "grad_norm": 5.361922264099121, "learning_rate": 5.589794555887261e-06, "loss": 1.3211, "mean_token_accuracy": 0.6952879428863525, "step": 334 }, { "epoch": 0.5464926590538336, "grad_norm": 4.938032627105713, "learning_rate": 5.564141053626412e-06, "loss": 1.0671, "mean_token_accuracy": 0.7365955114364624, "step": 335 }, { "epoch": 0.5481239804241436, "grad_norm": 5.547269344329834, "learning_rate": 5.538485466243609e-06, "loss": 1.1093, "mean_token_accuracy": 0.7243272662162781, "step": 336 }, { "epoch": 0.5497553017944535, "grad_norm": 4.6895318031311035, "learning_rate": 5.512828627760519e-06, "loss": 1.1681, "mean_token_accuracy": 0.721276581287384, "step": 337 }, { "epoch": 0.5513866231647635, "grad_norm": 4.660412311553955, "learning_rate": 5.487171372239484e-06, "loss": 1.0067, "mean_token_accuracy": 0.756860613822937, "step": 338 }, { "epoch": 0.5530179445350734, "grad_norm": 4.920834541320801, "learning_rate": 5.461514533756394e-06, "loss": 1.1513, "mean_token_accuracy": 0.7190653085708618, "step": 339 }, { "epoch": 0.5546492659053833, "grad_norm": 4.938427925109863, "learning_rate": 5.435858946373589e-06, "loss": 1.2603, "mean_token_accuracy": 0.6953125, "step": 340 }, { "epoch": 0.5562805872756933, "grad_norm": 5.092044830322266, "learning_rate": 5.410205444112739e-06, "loss": 1.3949, "mean_token_accuracy": 0.6499231457710266, "step": 341 }, { "epoch": 0.5579119086460033, "grad_norm": 5.051768779754639, "learning_rate": 5.384554860927727e-06, "loss": 1.2452, "mean_token_accuracy": 0.6954964399337769, "step": 342 }, { "epoch": 0.5595432300163132, "grad_norm": 5.367892265319824, "learning_rate": 5.35890803067754e-06, "loss": 1.4017, "mean_token_accuracy": 0.6810073256492615, "step": 343 }, { "epoch": 0.5611745513866232, "grad_norm": 5.445290565490723, "learning_rate": 5.333265787099165e-06, "loss": 1.3892, "mean_token_accuracy": 0.6558409929275513, "step": 344 }, { "epoch": 0.5628058727569332, "grad_norm": 4.5011091232299805, "learning_rate": 5.307628963780486e-06, "loss": 1.134, "mean_token_accuracy": 0.7164948582649231, "step": 345 }, { "epoch": 0.564437194127243, "grad_norm": 4.943375587463379, "learning_rate": 5.281998394133177e-06, "loss": 1.2984, "mean_token_accuracy": 0.6739248633384705, "step": 346 }, { "epoch": 0.566068515497553, "grad_norm": 5.216898441314697, "learning_rate": 5.256374911365621e-06, "loss": 1.0943, "mean_token_accuracy": 0.742380678653717, "step": 347 }, { "epoch": 0.567699836867863, "grad_norm": 5.109920501708984, "learning_rate": 5.2307593484558175e-06, "loss": 1.2526, "mean_token_accuracy": 0.7040214538574219, "step": 348 }, { "epoch": 0.5693311582381729, "grad_norm": 5.156416416168213, "learning_rate": 5.205152538124303e-06, "loss": 1.3782, "mean_token_accuracy": 0.6628924608230591, "step": 349 }, { "epoch": 0.5709624796084829, "grad_norm": 4.267928600311279, "learning_rate": 5.179555312807079e-06, "loss": 1.0406, "mean_token_accuracy": 0.7428425550460815, "step": 350 }, { "epoch": 0.5725938009787929, "grad_norm": 4.501908302307129, "learning_rate": 5.153968504628558e-06, "loss": 1.177, "mean_token_accuracy": 0.7172932624816895, "step": 351 }, { "epoch": 0.5742251223491027, "grad_norm": 4.767745494842529, "learning_rate": 5.1283929453745055e-06, "loss": 1.0175, "mean_token_accuracy": 0.7485062479972839, "step": 352 }, { "epoch": 0.5758564437194127, "grad_norm": 4.706437110900879, "learning_rate": 5.102829466465005e-06, "loss": 1.4045, "mean_token_accuracy": 0.6768350601196289, "step": 353 }, { "epoch": 0.5774877650897227, "grad_norm": 4.489633083343506, "learning_rate": 5.077278898927425e-06, "loss": 1.1147, "mean_token_accuracy": 0.7224782109260559, "step": 354 }, { "epoch": 0.5791190864600326, "grad_norm": 5.124320030212402, "learning_rate": 5.051742073369407e-06, "loss": 1.3278, "mean_token_accuracy": 0.6733261346817017, "step": 355 }, { "epoch": 0.5807504078303426, "grad_norm": 4.827151298522949, "learning_rate": 5.026219819951865e-06, "loss": 1.0634, "mean_token_accuracy": 0.7362812757492065, "step": 356 }, { "epoch": 0.5823817292006526, "grad_norm": 4.903264045715332, "learning_rate": 5.000712968361994e-06, "loss": 1.2472, "mean_token_accuracy": 0.6971870064735413, "step": 357 }, { "epoch": 0.5840130505709625, "grad_norm": 5.2526421546936035, "learning_rate": 4.975222347786299e-06, "loss": 1.4272, "mean_token_accuracy": 0.6600102186203003, "step": 358 }, { "epoch": 0.5856443719412724, "grad_norm": 4.291179656982422, "learning_rate": 4.949748786883647e-06, "loss": 1.058, "mean_token_accuracy": 0.7493276000022888, "step": 359 }, { "epoch": 0.5872756933115824, "grad_norm": 5.2762041091918945, "learning_rate": 4.924293113758314e-06, "loss": 1.3768, "mean_token_accuracy": 0.6687370538711548, "step": 360 }, { "epoch": 0.5889070146818923, "grad_norm": 4.883618354797363, "learning_rate": 4.898856155933084e-06, "loss": 1.2404, "mean_token_accuracy": 0.7043189406394958, "step": 361 }, { "epoch": 0.5905383360522023, "grad_norm": 5.123283863067627, "learning_rate": 4.873438740322325e-06, "loss": 1.2315, "mean_token_accuracy": 0.7245850563049316, "step": 362 }, { "epoch": 0.5921696574225123, "grad_norm": 5.0717267990112305, "learning_rate": 4.8480416932051255e-06, "loss": 1.29, "mean_token_accuracy": 0.6664901971817017, "step": 363 }, { "epoch": 0.5938009787928222, "grad_norm": 5.272220611572266, "learning_rate": 4.8226658401984295e-06, "loss": 1.4004, "mean_token_accuracy": 0.665610134601593, "step": 364 }, { "epoch": 0.5954323001631321, "grad_norm": 4.845883846282959, "learning_rate": 4.79731200623019e-06, "loss": 1.2454, "mean_token_accuracy": 0.695147693157196, "step": 365 }, { "epoch": 0.5970636215334421, "grad_norm": 4.378237247467041, "learning_rate": 4.771981015512559e-06, "loss": 0.8819, "mean_token_accuracy": 0.7768194079399109, "step": 366 }, { "epoch": 0.598694942903752, "grad_norm": 5.140429973602295, "learning_rate": 4.746673691515093e-06, "loss": 1.2651, "mean_token_accuracy": 0.6864282488822937, "step": 367 }, { "epoch": 0.600326264274062, "grad_norm": 5.289265155792236, "learning_rate": 4.721390856937976e-06, "loss": 1.1395, "mean_token_accuracy": 0.7040935754776001, "step": 368 }, { "epoch": 0.601957585644372, "grad_norm": 5.229256629943848, "learning_rate": 4.696133333685286e-06, "loss": 1.2456, "mean_token_accuracy": 0.6875337362289429, "step": 369 }, { "epoch": 0.6035889070146819, "grad_norm": 5.632972240447998, "learning_rate": 4.67090194283827e-06, "loss": 1.3961, "mean_token_accuracy": 0.670738160610199, "step": 370 }, { "epoch": 0.6052202283849919, "grad_norm": 4.017366409301758, "learning_rate": 4.645697504628649e-06, "loss": 0.9787, "mean_token_accuracy": 0.7547547817230225, "step": 371 }, { "epoch": 0.6068515497553018, "grad_norm": 5.327452182769775, "learning_rate": 4.6205208384119626e-06, "loss": 1.201, "mean_token_accuracy": 0.7167056202888489, "step": 372 }, { "epoch": 0.6084828711256117, "grad_norm": 5.935764312744141, "learning_rate": 4.595372762640924e-06, "loss": 1.6929, "mean_token_accuracy": 0.598901093006134, "step": 373 }, { "epoch": 0.6101141924959217, "grad_norm": 4.699948310852051, "learning_rate": 4.57025409483882e-06, "loss": 1.0746, "mean_token_accuracy": 0.753125011920929, "step": 374 }, { "epoch": 0.6117455138662317, "grad_norm": 4.765663146972656, "learning_rate": 4.545165651572926e-06, "loss": 1.2652, "mean_token_accuracy": 0.707344651222229, "step": 375 }, { "epoch": 0.6133768352365416, "grad_norm": 5.047967433929443, "learning_rate": 4.520108248427975e-06, "loss": 1.3292, "mean_token_accuracy": 0.67514967918396, "step": 376 }, { "epoch": 0.6150081566068516, "grad_norm": 4.735743045806885, "learning_rate": 4.49508269997963e-06, "loss": 1.2413, "mean_token_accuracy": 0.6973969340324402, "step": 377 }, { "epoch": 0.6166394779771615, "grad_norm": 4.652902603149414, "learning_rate": 4.470089819768011e-06, "loss": 1.088, "mean_token_accuracy": 0.7307506203651428, "step": 378 }, { "epoch": 0.6182707993474714, "grad_norm": 5.012315273284912, "learning_rate": 4.4451304202712486e-06, "loss": 1.1939, "mean_token_accuracy": 0.6982803344726562, "step": 379 }, { "epoch": 0.6199021207177814, "grad_norm": 5.312098026275635, "learning_rate": 4.420205312879065e-06, "loss": 1.1707, "mean_token_accuracy": 0.7069069147109985, "step": 380 }, { "epoch": 0.6215334420880914, "grad_norm": 5.2296462059021, "learning_rate": 4.395315307866404e-06, "loss": 1.4177, "mean_token_accuracy": 0.6581963300704956, "step": 381 }, { "epoch": 0.6231647634584013, "grad_norm": 5.124307155609131, "learning_rate": 4.37046121436709e-06, "loss": 1.4677, "mean_token_accuracy": 0.6682761907577515, "step": 382 }, { "epoch": 0.6247960848287113, "grad_norm": 5.106579303741455, "learning_rate": 4.3456438403475174e-06, "loss": 1.3623, "mean_token_accuracy": 0.670346200466156, "step": 383 }, { "epoch": 0.6264274061990212, "grad_norm": 5.028356552124023, "learning_rate": 4.320863992580393e-06, "loss": 1.2408, "mean_token_accuracy": 0.6920192837715149, "step": 384 }, { "epoch": 0.6280587275693311, "grad_norm": 4.444116115570068, "learning_rate": 4.296122476618507e-06, "loss": 1.1429, "mean_token_accuracy": 0.704580545425415, "step": 385 }, { "epoch": 0.6296900489396411, "grad_norm": 4.459901809692383, "learning_rate": 4.2714200967685405e-06, "loss": 1.1629, "mean_token_accuracy": 0.72246915102005, "step": 386 }, { "epoch": 0.6313213703099511, "grad_norm": 6.285675525665283, "learning_rate": 4.246757656064924e-06, "loss": 1.4322, "mean_token_accuracy": 0.6744464635848999, "step": 387 }, { "epoch": 0.632952691680261, "grad_norm": 4.624763011932373, "learning_rate": 4.222135956243732e-06, "loss": 1.2574, "mean_token_accuracy": 0.681078314781189, "step": 388 }, { "epoch": 0.634584013050571, "grad_norm": 4.69317626953125, "learning_rate": 4.19755579771662e-06, "loss": 0.912, "mean_token_accuracy": 0.7686527371406555, "step": 389 }, { "epoch": 0.636215334420881, "grad_norm": 5.052898406982422, "learning_rate": 4.173017979544804e-06, "loss": 1.2663, "mean_token_accuracy": 0.6840921640396118, "step": 390 }, { "epoch": 0.6378466557911908, "grad_norm": 4.830198287963867, "learning_rate": 4.148523299413075e-06, "loss": 1.1944, "mean_token_accuracy": 0.7132551670074463, "step": 391 }, { "epoch": 0.6394779771615008, "grad_norm": 4.418455600738525, "learning_rate": 4.124072553603887e-06, "loss": 1.1429, "mean_token_accuracy": 0.720703125, "step": 392 }, { "epoch": 0.6411092985318108, "grad_norm": 5.233757019042969, "learning_rate": 4.099666536971456e-06, "loss": 1.4604, "mean_token_accuracy": 0.6442708373069763, "step": 393 }, { "epoch": 0.6427406199021207, "grad_norm": 5.798137187957764, "learning_rate": 4.075306042915922e-06, "loss": 1.4581, "mean_token_accuracy": 0.6500260233879089, "step": 394 }, { "epoch": 0.6443719412724307, "grad_norm": 5.040640354156494, "learning_rate": 4.050991863357564e-06, "loss": 1.0962, "mean_token_accuracy": 0.7073915600776672, "step": 395 }, { "epoch": 0.6460032626427407, "grad_norm": 4.5930352210998535, "learning_rate": 4.026724788711047e-06, "loss": 1.1013, "mean_token_accuracy": 0.7120794057846069, "step": 396 }, { "epoch": 0.6476345840130505, "grad_norm": 4.828030586242676, "learning_rate": 4.002505607859738e-06, "loss": 1.1984, "mean_token_accuracy": 0.7033898234367371, "step": 397 }, { "epoch": 0.6492659053833605, "grad_norm": 4.7295331954956055, "learning_rate": 3.978335108130047e-06, "loss": 1.0876, "mean_token_accuracy": 0.7375543713569641, "step": 398 }, { "epoch": 0.6508972267536705, "grad_norm": 5.275457859039307, "learning_rate": 3.954214075265842e-06, "loss": 1.2306, "mean_token_accuracy": 0.7018927335739136, "step": 399 }, { "epoch": 0.6525285481239804, "grad_norm": 5.105504989624023, "learning_rate": 3.930143293402907e-06, "loss": 1.3803, "mean_token_accuracy": 0.637172520160675, "step": 400 }, { "epoch": 0.6541598694942904, "grad_norm": 4.749675750732422, "learning_rate": 3.906123545043441e-06, "loss": 1.1234, "mean_token_accuracy": 0.7189365029335022, "step": 401 }, { "epoch": 0.6557911908646004, "grad_norm": 4.926899433135986, "learning_rate": 3.882155611030631e-06, "loss": 1.2681, "mean_token_accuracy": 0.6934037208557129, "step": 402 }, { "epoch": 0.6574225122349103, "grad_norm": 5.266970157623291, "learning_rate": 3.858240270523262e-06, "loss": 1.3901, "mean_token_accuracy": 0.6708167195320129, "step": 403 }, { "epoch": 0.6590538336052202, "grad_norm": 4.678843975067139, "learning_rate": 3.834378300970385e-06, "loss": 1.096, "mean_token_accuracy": 0.7200217247009277, "step": 404 }, { "epoch": 0.6606851549755302, "grad_norm": 5.216601371765137, "learning_rate": 3.8105704780860575e-06, "loss": 1.515, "mean_token_accuracy": 0.6313887238502502, "step": 405 }, { "epoch": 0.6623164763458401, "grad_norm": 5.138411045074463, "learning_rate": 3.7868175758241065e-06, "loss": 1.2448, "mean_token_accuracy": 0.7139689326286316, "step": 406 }, { "epoch": 0.6639477977161501, "grad_norm": 4.843050479888916, "learning_rate": 3.7631203663529823e-06, "loss": 1.3766, "mean_token_accuracy": 0.6812297701835632, "step": 407 }, { "epoch": 0.6655791190864601, "grad_norm": 4.814765453338623, "learning_rate": 3.739479620030655e-06, "loss": 1.0831, "mean_token_accuracy": 0.7297152280807495, "step": 408 }, { "epoch": 0.66721044045677, "grad_norm": 4.954052448272705, "learning_rate": 3.715896105379562e-06, "loss": 1.2928, "mean_token_accuracy": 0.6796213388442993, "step": 409 }, { "epoch": 0.6688417618270799, "grad_norm": 4.973556995391846, "learning_rate": 3.692370589061639e-06, "loss": 1.203, "mean_token_accuracy": 0.7126886248588562, "step": 410 }, { "epoch": 0.6704730831973899, "grad_norm": 4.508687973022461, "learning_rate": 3.668903835853386e-06, "loss": 1.0417, "mean_token_accuracy": 0.7396226525306702, "step": 411 }, { "epoch": 0.6721044045676998, "grad_norm": 4.325466632843018, "learning_rate": 3.64549660862101e-06, "loss": 1.0965, "mean_token_accuracy": 0.7506775259971619, "step": 412 }, { "epoch": 0.6737357259380098, "grad_norm": 4.78257417678833, "learning_rate": 3.6221496682956236e-06, "loss": 1.2328, "mean_token_accuracy": 0.6968838572502136, "step": 413 }, { "epoch": 0.6753670473083198, "grad_norm": 5.217673301696777, "learning_rate": 3.5988637738485146e-06, "loss": 1.1468, "mean_token_accuracy": 0.7180641293525696, "step": 414 }, { "epoch": 0.6769983686786297, "grad_norm": 5.608780384063721, "learning_rate": 3.5756396822664595e-06, "loss": 1.4427, "mean_token_accuracy": 0.6482036113739014, "step": 415 }, { "epoch": 0.6786296900489397, "grad_norm": 4.913776397705078, "learning_rate": 3.5524781485271287e-06, "loss": 1.3126, "mean_token_accuracy": 0.703459620475769, "step": 416 }, { "epoch": 0.6802610114192496, "grad_norm": 4.990585803985596, "learning_rate": 3.5293799255745407e-06, "loss": 1.425, "mean_token_accuracy": 0.6552053689956665, "step": 417 }, { "epoch": 0.6818923327895595, "grad_norm": 5.035621643066406, "learning_rate": 3.5063457642945788e-06, "loss": 1.3351, "mean_token_accuracy": 0.6864407062530518, "step": 418 }, { "epoch": 0.6835236541598695, "grad_norm": 5.281700134277344, "learning_rate": 3.4833764134905835e-06, "loss": 1.2133, "mean_token_accuracy": 0.6881720423698425, "step": 419 }, { "epoch": 0.6851549755301795, "grad_norm": 4.842787742614746, "learning_rate": 3.4604726198590177e-06, "loss": 1.1954, "mean_token_accuracy": 0.7155085802078247, "step": 420 }, { "epoch": 0.6867862969004894, "grad_norm": 4.937472820281982, "learning_rate": 3.4376351279651788e-06, "loss": 1.3095, "mean_token_accuracy": 0.6968302726745605, "step": 421 }, { "epoch": 0.6884176182707994, "grad_norm": 4.842049598693848, "learning_rate": 3.4148646802190066e-06, "loss": 1.0614, "mean_token_accuracy": 0.7444320917129517, "step": 422 }, { "epoch": 0.6900489396411092, "grad_norm": 4.4336628913879395, "learning_rate": 3.392162016850945e-06, "loss": 1.0914, "mean_token_accuracy": 0.729891300201416, "step": 423 }, { "epoch": 0.6916802610114192, "grad_norm": 5.191675186157227, "learning_rate": 3.369527875887875e-06, "loss": 1.2101, "mean_token_accuracy": 0.7204244136810303, "step": 424 }, { "epoch": 0.6933115823817292, "grad_norm": 5.435412406921387, "learning_rate": 3.346962993129125e-06, "loss": 1.2044, "mean_token_accuracy": 0.7158119678497314, "step": 425 }, { "epoch": 0.6949429037520392, "grad_norm": 4.86824369430542, "learning_rate": 3.3244681021225506e-06, "loss": 1.1128, "mean_token_accuracy": 0.7002801299095154, "step": 426 }, { "epoch": 0.6965742251223491, "grad_norm": 4.692442417144775, "learning_rate": 3.302043934140693e-06, "loss": 1.247, "mean_token_accuracy": 0.683964729309082, "step": 427 }, { "epoch": 0.6982055464926591, "grad_norm": 4.847585201263428, "learning_rate": 3.279691218156998e-06, "loss": 1.2886, "mean_token_accuracy": 0.6823869347572327, "step": 428 }, { "epoch": 0.6998368678629691, "grad_norm": 4.947258472442627, "learning_rate": 3.2574106808221206e-06, "loss": 1.1626, "mean_token_accuracy": 0.7202127575874329, "step": 429 }, { "epoch": 0.7014681892332789, "grad_norm": 4.548014163970947, "learning_rate": 3.2352030464403117e-06, "loss": 1.1406, "mean_token_accuracy": 0.7432366013526917, "step": 430 }, { "epoch": 0.7030995106035889, "grad_norm": 4.8469743728637695, "learning_rate": 3.2130690369458594e-06, "loss": 1.2848, "mean_token_accuracy": 0.6986584067344666, "step": 431 }, { "epoch": 0.7047308319738989, "grad_norm": 4.121768474578857, "learning_rate": 3.191009371879627e-06, "loss": 0.9523, "mean_token_accuracy": 0.7665964365005493, "step": 432 }, { "epoch": 0.7063621533442088, "grad_norm": 4.720678329467773, "learning_rate": 3.1690247683656617e-06, "loss": 1.2706, "mean_token_accuracy": 0.6949771642684937, "step": 433 }, { "epoch": 0.7079934747145188, "grad_norm": 4.939698219299316, "learning_rate": 3.1471159410878784e-06, "loss": 1.3505, "mean_token_accuracy": 0.6539000272750854, "step": 434 }, { "epoch": 0.7096247960848288, "grad_norm": 4.132518291473389, "learning_rate": 3.125283602266832e-06, "loss": 0.9859, "mean_token_accuracy": 0.7509416341781616, "step": 435 }, { "epoch": 0.7112561174551386, "grad_norm": 5.197821617126465, "learning_rate": 3.1035284616365586e-06, "loss": 1.1386, "mean_token_accuracy": 0.714631199836731, "step": 436 }, { "epoch": 0.7128874388254486, "grad_norm": 4.722414016723633, "learning_rate": 3.0818512264215107e-06, "loss": 0.9581, "mean_token_accuracy": 0.7724301815032959, "step": 437 }, { "epoch": 0.7145187601957586, "grad_norm": 4.7281599044799805, "learning_rate": 3.060252601313557e-06, "loss": 1.1291, "mean_token_accuracy": 0.717391312122345, "step": 438 }, { "epoch": 0.7161500815660685, "grad_norm": 4.817330360412598, "learning_rate": 3.0387332884490806e-06, "loss": 1.1184, "mean_token_accuracy": 0.725653886795044, "step": 439 }, { "epoch": 0.7177814029363785, "grad_norm": 4.662072658538818, "learning_rate": 3.0172939873861486e-06, "loss": 1.1475, "mean_token_accuracy": 0.7279778122901917, "step": 440 }, { "epoch": 0.7194127243066885, "grad_norm": 4.278316020965576, "learning_rate": 2.995935395081781e-06, "loss": 0.9249, "mean_token_accuracy": 0.7603439092636108, "step": 441 }, { "epoch": 0.7210440456769984, "grad_norm": 4.619575500488281, "learning_rate": 2.9746582058692803e-06, "loss": 1.0338, "mean_token_accuracy": 0.7423180341720581, "step": 442 }, { "epoch": 0.7226753670473083, "grad_norm": 4.950908660888672, "learning_rate": 2.953463111435666e-06, "loss": 1.1649, "mean_token_accuracy": 0.7079599499702454, "step": 443 }, { "epoch": 0.7243066884176182, "grad_norm": 5.330234050750732, "learning_rate": 2.932350800799196e-06, "loss": 1.308, "mean_token_accuracy": 0.6914836764335632, "step": 444 }, { "epoch": 0.7259380097879282, "grad_norm": 5.278100490570068, "learning_rate": 2.9113219602869515e-06, "loss": 1.5142, "mean_token_accuracy": 0.6575073599815369, "step": 445 }, { "epoch": 0.7275693311582382, "grad_norm": 4.93350076675415, "learning_rate": 2.890377273512538e-06, "loss": 1.3363, "mean_token_accuracy": 0.6751728057861328, "step": 446 }, { "epoch": 0.7292006525285482, "grad_norm": 4.3188910484313965, "learning_rate": 2.8695174213538647e-06, "loss": 1.0682, "mean_token_accuracy": 0.7265364527702332, "step": 447 }, { "epoch": 0.7308319738988581, "grad_norm": 4.598663806915283, "learning_rate": 2.848743081930998e-06, "loss": 1.1568, "mean_token_accuracy": 0.7006726264953613, "step": 448 }, { "epoch": 0.732463295269168, "grad_norm": 5.178636074066162, "learning_rate": 2.8280549305841265e-06, "loss": 1.2928, "mean_token_accuracy": 0.6874651908874512, "step": 449 }, { "epoch": 0.734094616639478, "grad_norm": 5.297123908996582, "learning_rate": 2.8074536398516004e-06, "loss": 1.2612, "mean_token_accuracy": 0.6888131499290466, "step": 450 }, { "epoch": 0.7357259380097879, "grad_norm": 5.056674957275391, "learning_rate": 2.7869398794480778e-06, "loss": 1.1595, "mean_token_accuracy": 0.7092235684394836, "step": 451 }, { "epoch": 0.7373572593800979, "grad_norm": 4.8112030029296875, "learning_rate": 2.7665143162427427e-06, "loss": 1.2288, "mean_token_accuracy": 0.7047522664070129, "step": 452 }, { "epoch": 0.7389885807504079, "grad_norm": 4.844231605529785, "learning_rate": 2.746177614237631e-06, "loss": 1.3594, "mean_token_accuracy": 0.6892874240875244, "step": 453 }, { "epoch": 0.7406199021207178, "grad_norm": 5.323098659515381, "learning_rate": 2.7259304345460445e-06, "loss": 1.4409, "mean_token_accuracy": 0.6324736475944519, "step": 454 }, { "epoch": 0.7422512234910277, "grad_norm": 4.965455532073975, "learning_rate": 2.7057734353710655e-06, "loss": 1.2032, "mean_token_accuracy": 0.6982530355453491, "step": 455 }, { "epoch": 0.7438825448613376, "grad_norm": 4.611636161804199, "learning_rate": 2.6857072719841436e-06, "loss": 1.0921, "mean_token_accuracy": 0.7258726954460144, "step": 456 }, { "epoch": 0.7455138662316476, "grad_norm": 5.415761470794678, "learning_rate": 2.6657325967038084e-06, "loss": 1.4882, "mean_token_accuracy": 0.6622621417045593, "step": 457 }, { "epoch": 0.7471451876019576, "grad_norm": 5.130191326141357, "learning_rate": 2.645850058874463e-06, "loss": 1.2448, "mean_token_accuracy": 0.6971870064735413, "step": 458 }, { "epoch": 0.7487765089722676, "grad_norm": 4.7735748291015625, "learning_rate": 2.6260603048452636e-06, "loss": 1.2079, "mean_token_accuracy": 0.7042531967163086, "step": 459 }, { "epoch": 0.7504078303425775, "grad_norm": 4.764122486114502, "learning_rate": 2.6063639779491197e-06, "loss": 1.3132, "mean_token_accuracy": 0.677205502986908, "step": 460 }, { "epoch": 0.7520391517128875, "grad_norm": 4.8977556228637695, "learning_rate": 2.586761718481776e-06, "loss": 1.0483, "mean_token_accuracy": 0.7458379864692688, "step": 461 }, { "epoch": 0.7536704730831973, "grad_norm": 5.250521183013916, "learning_rate": 2.5672541636809957e-06, "loss": 1.3854, "mean_token_accuracy": 0.6714285612106323, "step": 462 }, { "epoch": 0.7553017944535073, "grad_norm": 4.352292537689209, "learning_rate": 2.5478419477058446e-06, "loss": 1.2105, "mean_token_accuracy": 0.714142918586731, "step": 463 }, { "epoch": 0.7569331158238173, "grad_norm": 4.649628162384033, "learning_rate": 2.52852570161608e-06, "loss": 1.1386, "mean_token_accuracy": 0.721030056476593, "step": 464 }, { "epoch": 0.7585644371941273, "grad_norm": 5.159845352172852, "learning_rate": 2.5093060533516357e-06, "loss": 1.0597, "mean_token_accuracy": 0.7296990156173706, "step": 465 }, { "epoch": 0.7601957585644372, "grad_norm": 4.948349475860596, "learning_rate": 2.4901836277122e-06, "loss": 1.2113, "mean_token_accuracy": 0.6993117928504944, "step": 466 }, { "epoch": 0.7618270799347472, "grad_norm": 4.682156085968018, "learning_rate": 2.4711590463369163e-06, "loss": 1.1495, "mean_token_accuracy": 0.7079691290855408, "step": 467 }, { "epoch": 0.763458401305057, "grad_norm": 4.9600830078125, "learning_rate": 2.4522329276841664e-06, "loss": 1.2248, "mean_token_accuracy": 0.7208632826805115, "step": 468 }, { "epoch": 0.765089722675367, "grad_norm": 5.011682033538818, "learning_rate": 2.4334058870114685e-06, "loss": 1.2514, "mean_token_accuracy": 0.690378725528717, "step": 469 }, { "epoch": 0.766721044045677, "grad_norm": 6.021939754486084, "learning_rate": 2.414678536355476e-06, "loss": 1.1848, "mean_token_accuracy": 0.7004357576370239, "step": 470 }, { "epoch": 0.768352365415987, "grad_norm": 5.621747970581055, "learning_rate": 2.3960514845120835e-06, "loss": 1.3135, "mean_token_accuracy": 0.6799768805503845, "step": 471 }, { "epoch": 0.7699836867862969, "grad_norm": 5.001407623291016, "learning_rate": 2.377525337016629e-06, "loss": 1.1641, "mean_token_accuracy": 0.7319232821464539, "step": 472 }, { "epoch": 0.7716150081566069, "grad_norm": 4.856801509857178, "learning_rate": 2.359100696124217e-06, "loss": 1.2248, "mean_token_accuracy": 0.7054263353347778, "step": 473 }, { "epoch": 0.7732463295269169, "grad_norm": 5.092650890350342, "learning_rate": 2.340778160790133e-06, "loss": 1.2368, "mean_token_accuracy": 0.6984392404556274, "step": 474 }, { "epoch": 0.7748776508972267, "grad_norm": 5.131616592407227, "learning_rate": 2.32255832665038e-06, "loss": 1.1432, "mean_token_accuracy": 0.7190889120101929, "step": 475 }, { "epoch": 0.7765089722675367, "grad_norm": 5.5193047523498535, "learning_rate": 2.3044417860023082e-06, "loss": 1.4145, "mean_token_accuracy": 0.6792343258857727, "step": 476 }, { "epoch": 0.7781402936378466, "grad_norm": 4.5522356033325195, "learning_rate": 2.286429127785365e-06, "loss": 1.2906, "mean_token_accuracy": 0.6974206566810608, "step": 477 }, { "epoch": 0.7797716150081566, "grad_norm": 4.760054588317871, "learning_rate": 2.2685209375619433e-06, "loss": 1.2122, "mean_token_accuracy": 0.7080909609794617, "step": 478 }, { "epoch": 0.7814029363784666, "grad_norm": 4.7698187828063965, "learning_rate": 2.250717797498361e-06, "loss": 1.2056, "mean_token_accuracy": 0.7150395512580872, "step": 479 }, { "epoch": 0.7830342577487766, "grad_norm": 5.215602397918701, "learning_rate": 2.2330202863459123e-06, "loss": 1.417, "mean_token_accuracy": 0.6677489280700684, "step": 480 }, { "epoch": 0.7846655791190864, "grad_norm": 5.066779136657715, "learning_rate": 2.215428979422074e-06, "loss": 1.3455, "mean_token_accuracy": 0.6654175519943237, "step": 481 }, { "epoch": 0.7862969004893964, "grad_norm": 4.236968994140625, "learning_rate": 2.1979444485917957e-06, "loss": 1.2404, "mean_token_accuracy": 0.7059952020645142, "step": 482 }, { "epoch": 0.7879282218597063, "grad_norm": 4.6524224281311035, "learning_rate": 2.1805672622489044e-06, "loss": 1.2244, "mean_token_accuracy": 0.6920965909957886, "step": 483 }, { "epoch": 0.7895595432300163, "grad_norm": 4.233443737030029, "learning_rate": 2.163297985297633e-06, "loss": 1.015, "mean_token_accuracy": 0.7436676621437073, "step": 484 }, { "epoch": 0.7911908646003263, "grad_norm": 4.818909168243408, "learning_rate": 2.1461371791342572e-06, "loss": 1.1409, "mean_token_accuracy": 0.7303598523139954, "step": 485 }, { "epoch": 0.7928221859706363, "grad_norm": 5.0629448890686035, "learning_rate": 2.129085401628841e-06, "loss": 1.263, "mean_token_accuracy": 0.6733444333076477, "step": 486 }, { "epoch": 0.7944535073409462, "grad_norm": 5.042863368988037, "learning_rate": 2.1121432071071008e-06, "loss": 1.2654, "mean_token_accuracy": 0.6947311162948608, "step": 487 }, { "epoch": 0.7960848287112561, "grad_norm": 4.359389305114746, "learning_rate": 2.0953111463323885e-06, "loss": 1.09, "mean_token_accuracy": 0.7307896018028259, "step": 488 }, { "epoch": 0.797716150081566, "grad_norm": 4.828915119171143, "learning_rate": 2.07858976648779e-06, "loss": 1.3271, "mean_token_accuracy": 0.6866028904914856, "step": 489 }, { "epoch": 0.799347471451876, "grad_norm": 5.311947822570801, "learning_rate": 2.061979611158329e-06, "loss": 1.4026, "mean_token_accuracy": 0.6727748513221741, "step": 490 }, { "epoch": 0.800978792822186, "grad_norm": 5.242700576782227, "learning_rate": 2.045481220313298e-06, "loss": 1.3683, "mean_token_accuracy": 0.6764549016952515, "step": 491 }, { "epoch": 0.802610114192496, "grad_norm": 4.709912300109863, "learning_rate": 2.0290951302887117e-06, "loss": 1.1447, "mean_token_accuracy": 0.7429931163787842, "step": 492 }, { "epoch": 0.8042414355628059, "grad_norm": 4.1881184577941895, "learning_rate": 2.0128218737698653e-06, "loss": 1.0764, "mean_token_accuracy": 0.7385087013244629, "step": 493 }, { "epoch": 0.8058727569331158, "grad_norm": 4.042761325836182, "learning_rate": 1.996661979774017e-06, "loss": 1.0007, "mean_token_accuracy": 0.743196427822113, "step": 494 }, { "epoch": 0.8075040783034257, "grad_norm": 4.446390151977539, "learning_rate": 1.9806159736331935e-06, "loss": 1.0239, "mean_token_accuracy": 0.7473176121711731, "step": 495 }, { "epoch": 0.8091353996737357, "grad_norm": 4.78018856048584, "learning_rate": 1.964684376977115e-06, "loss": 1.1063, "mean_token_accuracy": 0.7371134161949158, "step": 496 }, { "epoch": 0.8107667210440457, "grad_norm": 5.604861736297607, "learning_rate": 1.94886770771623e-06, "loss": 1.4752, "mean_token_accuracy": 0.6601036190986633, "step": 497 }, { "epoch": 0.8123980424143556, "grad_norm": 5.058335304260254, "learning_rate": 1.933166480024883e-06, "loss": 1.055, "mean_token_accuracy": 0.7369833588600159, "step": 498 }, { "epoch": 0.8140293637846656, "grad_norm": 4.705621242523193, "learning_rate": 1.9175812043246034e-06, "loss": 1.2298, "mean_token_accuracy": 0.6933262944221497, "step": 499 }, { "epoch": 0.8156606851549756, "grad_norm": 4.777103424072266, "learning_rate": 1.9021123872675062e-06, "loss": 1.1538, "mean_token_accuracy": 0.7174683809280396, "step": 500 }, { "epoch": 0.8172920065252854, "grad_norm": 4.314986705780029, "learning_rate": 1.886760531719825e-06, "loss": 0.9366, "mean_token_accuracy": 0.7647951245307922, "step": 501 }, { "epoch": 0.8189233278955954, "grad_norm": 4.484466075897217, "learning_rate": 1.8715261367455634e-06, "loss": 1.0794, "mean_token_accuracy": 0.744053304195404, "step": 502 }, { "epoch": 0.8205546492659054, "grad_norm": 4.761155605316162, "learning_rate": 1.8564096975902715e-06, "loss": 1.1912, "mean_token_accuracy": 0.7101010084152222, "step": 503 }, { "epoch": 0.8221859706362153, "grad_norm": 5.64600944519043, "learning_rate": 1.8414117056649466e-06, "loss": 1.3092, "mean_token_accuracy": 0.6834645867347717, "step": 504 }, { "epoch": 0.8238172920065253, "grad_norm": 4.866972923278809, "learning_rate": 1.8265326485300582e-06, "loss": 1.0176, "mean_token_accuracy": 0.7384013533592224, "step": 505 }, { "epoch": 0.8254486133768353, "grad_norm": 4.5388994216918945, "learning_rate": 1.8117730098796996e-06, "loss": 1.2966, "mean_token_accuracy": 0.701646089553833, "step": 506 }, { "epoch": 0.8270799347471451, "grad_norm": 4.454381942749023, "learning_rate": 1.7971332695258592e-06, "loss": 1.1112, "mean_token_accuracy": 0.7266221642494202, "step": 507 }, { "epoch": 0.8287112561174551, "grad_norm": 4.481594085693359, "learning_rate": 1.7826139033828263e-06, "loss": 1.2742, "mean_token_accuracy": 0.6912720799446106, "step": 508 }, { "epoch": 0.8303425774877651, "grad_norm": 4.99500036239624, "learning_rate": 1.768215383451723e-06, "loss": 1.1617, "mean_token_accuracy": 0.710889995098114, "step": 509 }, { "epoch": 0.831973898858075, "grad_norm": 4.590748310089111, "learning_rate": 1.7539381778051511e-06, "loss": 1.046, "mean_token_accuracy": 0.7437499761581421, "step": 510 }, { "epoch": 0.833605220228385, "grad_norm": 4.781766414642334, "learning_rate": 1.7397827505719852e-06, "loss": 1.2756, "mean_token_accuracy": 0.6818851232528687, "step": 511 }, { "epoch": 0.835236541598695, "grad_norm": 4.8062744140625, "learning_rate": 1.7257495619222763e-06, "loss": 1.2438, "mean_token_accuracy": 0.6988636255264282, "step": 512 }, { "epoch": 0.8368678629690048, "grad_norm": 4.5913615226745605, "learning_rate": 1.7118390680523023e-06, "loss": 1.1542, "mean_token_accuracy": 0.7089864015579224, "step": 513 }, { "epoch": 0.8384991843393148, "grad_norm": 4.614170551300049, "learning_rate": 1.6980517211697293e-06, "loss": 1.0838, "mean_token_accuracy": 0.7278003692626953, "step": 514 }, { "epoch": 0.8401305057096248, "grad_norm": 4.4173359870910645, "learning_rate": 1.6843879694789095e-06, "loss": 1.1843, "mean_token_accuracy": 0.7148330807685852, "step": 515 }, { "epoch": 0.8417618270799347, "grad_norm": 4.110933303833008, "learning_rate": 1.6708482571663238e-06, "loss": 1.0402, "mean_token_accuracy": 0.7376889586448669, "step": 516 }, { "epoch": 0.8433931484502447, "grad_norm": 4.51687479019165, "learning_rate": 1.657433024386127e-06, "loss": 1.0383, "mean_token_accuracy": 0.7657114267349243, "step": 517 }, { "epoch": 0.8450244698205547, "grad_norm": 5.177441596984863, "learning_rate": 1.6441427072458493e-06, "loss": 1.3209, "mean_token_accuracy": 0.6875, "step": 518 }, { "epoch": 0.8466557911908646, "grad_norm": 4.650432109832764, "learning_rate": 1.630977737792212e-06, "loss": 1.1279, "mean_token_accuracy": 0.7242990732192993, "step": 519 }, { "epoch": 0.8482871125611745, "grad_norm": 4.902032852172852, "learning_rate": 1.6179385439970897e-06, "loss": 1.1124, "mean_token_accuracy": 0.7066738605499268, "step": 520 }, { "epoch": 0.8499184339314845, "grad_norm": 4.605056285858154, "learning_rate": 1.6050255497435902e-06, "loss": 1.0645, "mean_token_accuracy": 0.7346938848495483, "step": 521 }, { "epoch": 0.8515497553017944, "grad_norm": 5.043729305267334, "learning_rate": 1.592239174812279e-06, "loss": 1.3279, "mean_token_accuracy": 0.6896191835403442, "step": 522 }, { "epoch": 0.8531810766721044, "grad_norm": 5.051156520843506, "learning_rate": 1.5795798348675352e-06, "loss": 1.0265, "mean_token_accuracy": 0.7457534074783325, "step": 523 }, { "epoch": 0.8548123980424144, "grad_norm": 4.62628173828125, "learning_rate": 1.5670479414440315e-06, "loss": 1.0211, "mean_token_accuracy": 0.7560975551605225, "step": 524 }, { "epoch": 0.8564437194127243, "grad_norm": 5.277249813079834, "learning_rate": 1.5546439019333632e-06, "loss": 1.3336, "mean_token_accuracy": 0.681064784526825, "step": 525 }, { "epoch": 0.8580750407830342, "grad_norm": 4.982065677642822, "learning_rate": 1.5423681195707997e-06, "loss": 1.4144, "mean_token_accuracy": 0.6686686873435974, "step": 526 }, { "epoch": 0.8597063621533442, "grad_norm": 4.6587605476379395, "learning_rate": 1.5302209934221796e-06, "loss": 1.1911, "mean_token_accuracy": 0.7020725607872009, "step": 527 }, { "epoch": 0.8613376835236541, "grad_norm": 5.415839195251465, "learning_rate": 1.5182029183709345e-06, "loss": 1.3637, "mean_token_accuracy": 0.6866196990013123, "step": 528 }, { "epoch": 0.8629690048939641, "grad_norm": 4.830744743347168, "learning_rate": 1.5063142851052535e-06, "loss": 1.0927, "mean_token_accuracy": 0.7163712382316589, "step": 529 }, { "epoch": 0.8646003262642741, "grad_norm": 4.314631938934326, "learning_rate": 1.4945554801053852e-06, "loss": 1.0773, "mean_token_accuracy": 0.7398513555526733, "step": 530 }, { "epoch": 0.866231647634584, "grad_norm": 4.3542680740356445, "learning_rate": 1.4829268856310677e-06, "loss": 1.1271, "mean_token_accuracy": 0.7248595952987671, "step": 531 }, { "epoch": 0.867862969004894, "grad_norm": 4.48630952835083, "learning_rate": 1.471428879709107e-06, "loss": 1.0675, "mean_token_accuracy": 0.7440000176429749, "step": 532 }, { "epoch": 0.8694942903752039, "grad_norm": 4.849664211273193, "learning_rate": 1.4600618361210857e-06, "loss": 1.2855, "mean_token_accuracy": 0.713458776473999, "step": 533 }, { "epoch": 0.8711256117455138, "grad_norm": 4.989716529846191, "learning_rate": 1.448826124391215e-06, "loss": 1.2499, "mean_token_accuracy": 0.7188649773597717, "step": 534 }, { "epoch": 0.8727569331158238, "grad_norm": 4.539302825927734, "learning_rate": 1.437722109774317e-06, "loss": 1.1633, "mean_token_accuracy": 0.7338669300079346, "step": 535 }, { "epoch": 0.8743882544861338, "grad_norm": 4.66331148147583, "learning_rate": 1.4267501532439526e-06, "loss": 1.2576, "mean_token_accuracy": 0.6965973377227783, "step": 536 }, { "epoch": 0.8760195758564437, "grad_norm": 4.61297607421875, "learning_rate": 1.4159106114806943e-06, "loss": 1.3736, "mean_token_accuracy": 0.6653734445571899, "step": 537 }, { "epoch": 0.8776508972267537, "grad_norm": 4.935201644897461, "learning_rate": 1.4052038368605156e-06, "loss": 1.3792, "mean_token_accuracy": 0.6775679588317871, "step": 538 }, { "epoch": 0.8792822185970636, "grad_norm": 4.569594383239746, "learning_rate": 1.3946301774433502e-06, "loss": 1.105, "mean_token_accuracy": 0.7271789312362671, "step": 539 }, { "epoch": 0.8809135399673735, "grad_norm": 4.568352699279785, "learning_rate": 1.3841899769617723e-06, "loss": 1.1148, "mean_token_accuracy": 0.7321231961250305, "step": 540 }, { "epoch": 0.8825448613376835, "grad_norm": 5.049271583557129, "learning_rate": 1.3738835748098198e-06, "loss": 1.0984, "mean_token_accuracy": 0.7366254925727844, "step": 541 }, { "epoch": 0.8841761827079935, "grad_norm": 5.136232376098633, "learning_rate": 1.3637113060319629e-06, "loss": 1.2849, "mean_token_accuracy": 0.6897223591804504, "step": 542 }, { "epoch": 0.8858075040783034, "grad_norm": 4.453695774078369, "learning_rate": 1.3536735013122144e-06, "loss": 1.0962, "mean_token_accuracy": 0.7319535613059998, "step": 543 }, { "epoch": 0.8874388254486134, "grad_norm": 4.621738910675049, "learning_rate": 1.3437704869633772e-06, "loss": 1.0924, "mean_token_accuracy": 0.7451643943786621, "step": 544 }, { "epoch": 0.8890701468189234, "grad_norm": 4.363915920257568, "learning_rate": 1.334002584916437e-06, "loss": 1.2547, "mean_token_accuracy": 0.6975655555725098, "step": 545 }, { "epoch": 0.8907014681892332, "grad_norm": 4.77221155166626, "learning_rate": 1.3243701127100971e-06, "loss": 1.1272, "mean_token_accuracy": 0.732022762298584, "step": 546 }, { "epoch": 0.8923327895595432, "grad_norm": 4.910726070404053, "learning_rate": 1.314873383480455e-06, "loss": 1.1381, "mean_token_accuracy": 0.7128287553787231, "step": 547 }, { "epoch": 0.8939641109298532, "grad_norm": 4.650912284851074, "learning_rate": 1.3055127059508257e-06, "loss": 1.0727, "mean_token_accuracy": 0.7480478882789612, "step": 548 }, { "epoch": 0.8955954323001631, "grad_norm": 3.9856724739074707, "learning_rate": 1.2962883844217e-06, "loss": 0.8759, "mean_token_accuracy": 0.7789642214775085, "step": 549 }, { "epoch": 0.8972267536704731, "grad_norm": 4.78012752532959, "learning_rate": 1.287200718760859e-06, "loss": 1.2732, "mean_token_accuracy": 0.6914893388748169, "step": 550 }, { "epoch": 0.8988580750407831, "grad_norm": 4.302763938903809, "learning_rate": 1.27825000439362e-06, "loss": 1.0871, "mean_token_accuracy": 0.7311992049217224, "step": 551 }, { "epoch": 0.9004893964110929, "grad_norm": 4.6384100914001465, "learning_rate": 1.2694365322932365e-06, "loss": 1.3448, "mean_token_accuracy": 0.6719226837158203, "step": 552 }, { "epoch": 0.9021207177814029, "grad_norm": 4.745211124420166, "learning_rate": 1.2607605889714359e-06, "loss": 1.19, "mean_token_accuracy": 0.7090080976486206, "step": 553 }, { "epoch": 0.9037520391517129, "grad_norm": 4.419302940368652, "learning_rate": 1.252222456469111e-06, "loss": 0.9335, "mean_token_accuracy": 0.774678111076355, "step": 554 }, { "epoch": 0.9053833605220228, "grad_norm": 5.066204071044922, "learning_rate": 1.2438224123471442e-06, "loss": 1.3473, "mean_token_accuracy": 0.6653266549110413, "step": 555 }, { "epoch": 0.9070146818923328, "grad_norm": 4.375471115112305, "learning_rate": 1.2355607296773896e-06, "loss": 1.2947, "mean_token_accuracy": 0.6962790489196777, "step": 556 }, { "epoch": 0.9086460032626428, "grad_norm": 5.035999774932861, "learning_rate": 1.2274376770337925e-06, "loss": 1.1271, "mean_token_accuracy": 0.7255297899246216, "step": 557 }, { "epoch": 0.9102773246329527, "grad_norm": 4.534280776977539, "learning_rate": 1.2194535184836633e-06, "loss": 1.1659, "mean_token_accuracy": 0.7146624326705933, "step": 558 }, { "epoch": 0.9119086460032626, "grad_norm": 4.192361354827881, "learning_rate": 1.2116085135790872e-06, "loss": 0.9654, "mean_token_accuracy": 0.7518177628517151, "step": 559 }, { "epoch": 0.9135399673735726, "grad_norm": 5.638926982879639, "learning_rate": 1.2039029173484892e-06, "loss": 1.6001, "mean_token_accuracy": 0.6247368454933167, "step": 560 }, { "epoch": 0.9151712887438825, "grad_norm": 4.600732326507568, "learning_rate": 1.1963369802883478e-06, "loss": 1.2123, "mean_token_accuracy": 0.7063252925872803, "step": 561 }, { "epoch": 0.9168026101141925, "grad_norm": 4.525058746337891, "learning_rate": 1.1889109483550411e-06, "loss": 1.0932, "mean_token_accuracy": 0.7251037359237671, "step": 562 }, { "epoch": 0.9184339314845025, "grad_norm": 4.5724005699157715, "learning_rate": 1.1816250629568632e-06, "loss": 1.0861, "mean_token_accuracy": 0.7240241765975952, "step": 563 }, { "epoch": 0.9200652528548124, "grad_norm": 5.578955173492432, "learning_rate": 1.1744795609461683e-06, "loss": 1.2629, "mean_token_accuracy": 0.6909705400466919, "step": 564 }, { "epoch": 0.9216965742251223, "grad_norm": 5.318408489227295, "learning_rate": 1.167474674611675e-06, "loss": 1.0538, "mean_token_accuracy": 0.7338252067565918, "step": 565 }, { "epoch": 0.9233278955954323, "grad_norm": 4.251341342926025, "learning_rate": 1.1606106316709122e-06, "loss": 1.0875, "mean_token_accuracy": 0.7354260087013245, "step": 566 }, { "epoch": 0.9249592169657422, "grad_norm": 5.110576629638672, "learning_rate": 1.1538876552628183e-06, "loss": 1.1861, "mean_token_accuracy": 0.7216981053352356, "step": 567 }, { "epoch": 0.9265905383360522, "grad_norm": 4.9769721031188965, "learning_rate": 1.147305963940488e-06, "loss": 1.0369, "mean_token_accuracy": 0.744041919708252, "step": 568 }, { "epoch": 0.9282218597063622, "grad_norm": 5.02736759185791, "learning_rate": 1.1408657716640643e-06, "loss": 1.5051, "mean_token_accuracy": 0.6656504273414612, "step": 569 }, { "epoch": 0.9298531810766721, "grad_norm": 4.389795303344727, "learning_rate": 1.134567287793787e-06, "loss": 1.1081, "mean_token_accuracy": 0.7329843044281006, "step": 570 }, { "epoch": 0.9314845024469821, "grad_norm": 4.3082427978515625, "learning_rate": 1.128410717083182e-06, "loss": 1.0839, "mean_token_accuracy": 0.729187548160553, "step": 571 }, { "epoch": 0.933115823817292, "grad_norm": 5.201175212860107, "learning_rate": 1.1223962596724115e-06, "loss": 1.2717, "mean_token_accuracy": 0.6742309927940369, "step": 572 }, { "epoch": 0.9347471451876019, "grad_norm": 4.306964874267578, "learning_rate": 1.1165241110817602e-06, "loss": 1.1214, "mean_token_accuracy": 0.721930742263794, "step": 573 }, { "epoch": 0.9363784665579119, "grad_norm": 4.683149814605713, "learning_rate": 1.1107944622052857e-06, "loss": 1.2618, "mean_token_accuracy": 0.6971399188041687, "step": 574 }, { "epoch": 0.9380097879282219, "grad_norm": 5.620746612548828, "learning_rate": 1.1052074993046102e-06, "loss": 1.2447, "mean_token_accuracy": 0.6849538087844849, "step": 575 }, { "epoch": 0.9396411092985318, "grad_norm": 4.673566818237305, "learning_rate": 1.0997634040028643e-06, "loss": 1.1948, "mean_token_accuracy": 0.7181038856506348, "step": 576 }, { "epoch": 0.9412724306688418, "grad_norm": 4.916784763336182, "learning_rate": 1.0944623532787844e-06, "loss": 1.0561, "mean_token_accuracy": 0.7292323708534241, "step": 577 }, { "epoch": 0.9429037520391517, "grad_norm": 4.703412055969238, "learning_rate": 1.0893045194609596e-06, "loss": 1.1676, "mean_token_accuracy": 0.7098938226699829, "step": 578 }, { "epoch": 0.9445350734094616, "grad_norm": 4.568572521209717, "learning_rate": 1.0842900702222283e-06, "loss": 1.3776, "mean_token_accuracy": 0.6739327907562256, "step": 579 }, { "epoch": 0.9461663947797716, "grad_norm": 4.693262577056885, "learning_rate": 1.0794191685742276e-06, "loss": 1.3097, "mean_token_accuracy": 0.6928605437278748, "step": 580 }, { "epoch": 0.9477977161500816, "grad_norm": 4.470661640167236, "learning_rate": 1.074691972862095e-06, "loss": 1.0411, "mean_token_accuracy": 0.7344121932983398, "step": 581 }, { "epoch": 0.9494290375203915, "grad_norm": 4.835824966430664, "learning_rate": 1.070108636759322e-06, "loss": 1.037, "mean_token_accuracy": 0.7305210828781128, "step": 582 }, { "epoch": 0.9510603588907015, "grad_norm": 5.557530403137207, "learning_rate": 1.0656693092627534e-06, "loss": 1.4262, "mean_token_accuracy": 0.6733524203300476, "step": 583 }, { "epoch": 0.9526916802610114, "grad_norm": 5.196942329406738, "learning_rate": 1.0613741346877498e-06, "loss": 1.0261, "mean_token_accuracy": 0.7382140755653381, "step": 584 }, { "epoch": 0.9543230016313213, "grad_norm": 4.611567497253418, "learning_rate": 1.0572232526634918e-06, "loss": 1.1281, "mean_token_accuracy": 0.7303561568260193, "step": 585 }, { "epoch": 0.9559543230016313, "grad_norm": 4.973482608795166, "learning_rate": 1.0532167981284437e-06, "loss": 1.1927, "mean_token_accuracy": 0.7078921794891357, "step": 586 }, { "epoch": 0.9575856443719413, "grad_norm": 4.535336017608643, "learning_rate": 1.0493549013259644e-06, "loss": 1.1746, "mean_token_accuracy": 0.736328125, "step": 587 }, { "epoch": 0.9592169657422512, "grad_norm": 4.868354320526123, "learning_rate": 1.0456376878000754e-06, "loss": 1.1741, "mean_token_accuracy": 0.7153007984161377, "step": 588 }, { "epoch": 0.9608482871125612, "grad_norm": 4.772627353668213, "learning_rate": 1.0420652783913794e-06, "loss": 1.2043, "mean_token_accuracy": 0.7127602696418762, "step": 589 }, { "epoch": 0.9624796084828712, "grad_norm": 4.772705554962158, "learning_rate": 1.03863778923313e-06, "loss": 1.2719, "mean_token_accuracy": 0.6881773471832275, "step": 590 }, { "epoch": 0.964110929853181, "grad_norm": 4.778547286987305, "learning_rate": 1.0353553317474574e-06, "loss": 1.0815, "mean_token_accuracy": 0.7438063025474548, "step": 591 }, { "epoch": 0.965742251223491, "grad_norm": 4.736347198486328, "learning_rate": 1.0322180126417494e-06, "loss": 1.1622, "mean_token_accuracy": 0.7216761708259583, "step": 592 }, { "epoch": 0.967373572593801, "grad_norm": 4.148738384246826, "learning_rate": 1.0292259339051769e-06, "loss": 1.1596, "mean_token_accuracy": 0.7182163000106812, "step": 593 }, { "epoch": 0.9690048939641109, "grad_norm": 4.727193832397461, "learning_rate": 1.026379192805382e-06, "loss": 1.4, "mean_token_accuracy": 0.6754344701766968, "step": 594 }, { "epoch": 0.9706362153344209, "grad_norm": 4.908797264099121, "learning_rate": 1.0236778818853158e-06, "loss": 1.3418, "mean_token_accuracy": 0.6792452931404114, "step": 595 }, { "epoch": 0.9722675367047309, "grad_norm": 5.056847095489502, "learning_rate": 1.0211220889602289e-06, "loss": 1.1988, "mean_token_accuracy": 0.715332567691803, "step": 596 }, { "epoch": 0.9738988580750407, "grad_norm": 4.906404495239258, "learning_rate": 1.018711897114817e-06, "loss": 1.3387, "mean_token_accuracy": 0.6841831207275391, "step": 597 }, { "epoch": 0.9755301794453507, "grad_norm": 4.806992530822754, "learning_rate": 1.0164473847005205e-06, "loss": 1.2102, "mean_token_accuracy": 0.7100494503974915, "step": 598 }, { "epoch": 0.9771615008156607, "grad_norm": 4.936591148376465, "learning_rate": 1.0143286253329769e-06, "loss": 1.1404, "mean_token_accuracy": 0.7201149463653564, "step": 599 }, { "epoch": 0.9787928221859706, "grad_norm": 4.549412727355957, "learning_rate": 1.0123556878896274e-06, "loss": 1.2092, "mean_token_accuracy": 0.7039577960968018, "step": 600 }, { "epoch": 0.9804241435562806, "grad_norm": 4.218964576721191, "learning_rate": 1.0105286365074788e-06, "loss": 0.9088, "mean_token_accuracy": 0.775624692440033, "step": 601 }, { "epoch": 0.9820554649265906, "grad_norm": 5.200554847717285, "learning_rate": 1.0088475305810178e-06, "loss": 1.1501, "mean_token_accuracy": 0.7204116582870483, "step": 602 }, { "epoch": 0.9836867862969005, "grad_norm": 4.525951385498047, "learning_rate": 1.0073124247602805e-06, "loss": 1.1539, "mean_token_accuracy": 0.7239478826522827, "step": 603 }, { "epoch": 0.9853181076672104, "grad_norm": 4.843019008636475, "learning_rate": 1.0059233689490742e-06, "loss": 1.3085, "mean_token_accuracy": 0.6880208253860474, "step": 604 }, { "epoch": 0.9869494290375204, "grad_norm": 4.720979690551758, "learning_rate": 1.0046804083033585e-06, "loss": 0.753, "mean_token_accuracy": 0.8125383853912354, "step": 605 }, { "epoch": 0.9885807504078303, "grad_norm": 4.803732395172119, "learning_rate": 1.0035835832297736e-06, "loss": 1.1941, "mean_token_accuracy": 0.6987314820289612, "step": 606 }, { "epoch": 0.9902120717781403, "grad_norm": 4.542924880981445, "learning_rate": 1.00263292938433e-06, "loss": 1.0593, "mean_token_accuracy": 0.730215847492218, "step": 607 }, { "epoch": 0.9918433931484503, "grad_norm": 4.79932975769043, "learning_rate": 1.0018284776712475e-06, "loss": 1.3496, "mean_token_accuracy": 0.6967418789863586, "step": 608 }, { "epoch": 0.9934747145187602, "grad_norm": 4.847198963165283, "learning_rate": 1.0011702542419498e-06, "loss": 1.1661, "mean_token_accuracy": 0.7184059023857117, "step": 609 }, { "epoch": 0.9951060358890701, "grad_norm": 5.3224406242370605, "learning_rate": 1.0006582804942171e-06, "loss": 1.2955, "mean_token_accuracy": 0.6828246712684631, "step": 610 }, { "epoch": 0.9967373572593801, "grad_norm": 4.992889404296875, "learning_rate": 1.000292573071488e-06, "loss": 1.2474, "mean_token_accuracy": 0.7027914524078369, "step": 611 }, { "epoch": 0.99836867862969, "grad_norm": 5.054308891296387, "learning_rate": 1.000073143862319e-06, "loss": 1.1465, "mean_token_accuracy": 0.7231833934783936, "step": 612 }, { "epoch": 1.0, "grad_norm": 5.045431613922119, "learning_rate": 1.0000000000000002e-06, "loss": 1.0511, "mean_token_accuracy": 0.7292899489402771, "step": 613 }, { "epoch": 1.0, "step": 613, "total_flos": 1.770321796592042e+18, "train_loss": 1.3923614178746209, "train_runtime": 2541.2937, "train_samples_per_second": 7.707, "train_steps_per_second": 0.241 } ], "logging_steps": 1, "max_steps": 613, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.770321796592042e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }